LLVM 18.1.0rc
AArch64ISelLowering.cpp
Go to the documentation of this file.
1//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the AArch64TargetLowering class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AArch64ISelLowering.h"
15#include "AArch64ExpandImm.h"
18#include "AArch64RegisterInfo.h"
19#include "AArch64Subtarget.h"
22#include "llvm/ADT/APFloat.h"
23#include "llvm/ADT/APInt.h"
24#include "llvm/ADT/ArrayRef.h"
25#include "llvm/ADT/STLExtras.h"
26#include "llvm/ADT/SmallSet.h"
28#include "llvm/ADT/Statistic.h"
29#include "llvm/ADT/StringRef.h"
30#include "llvm/ADT/Twine.h"
59#include "llvm/IR/Attributes.h"
60#include "llvm/IR/Constants.h"
61#include "llvm/IR/DataLayout.h"
62#include "llvm/IR/DebugLoc.h"
64#include "llvm/IR/Function.h"
66#include "llvm/IR/GlobalValue.h"
67#include "llvm/IR/IRBuilder.h"
68#include "llvm/IR/Instruction.h"
71#include "llvm/IR/Intrinsics.h"
72#include "llvm/IR/IntrinsicsAArch64.h"
73#include "llvm/IR/Module.h"
75#include "llvm/IR/Type.h"
76#include "llvm/IR/Use.h"
77#include "llvm/IR/Value.h"
83#include "llvm/Support/Debug.h"
92#include <algorithm>
93#include <bitset>
94#include <cassert>
95#include <cctype>
96#include <cstdint>
97#include <cstdlib>
98#include <iterator>
99#include <limits>
100#include <optional>
101#include <tuple>
102#include <utility>
103#include <vector>
104
105using namespace llvm;
106using namespace llvm::PatternMatch;
107
108#define DEBUG_TYPE "aarch64-lower"
109
110STATISTIC(NumTailCalls, "Number of tail calls");
111STATISTIC(NumShiftInserts, "Number of vector shift inserts");
112STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
113
114// FIXME: The necessary dtprel relocations don't seem to be supported
115// well in the GNU bfd and gold linkers at the moment. Therefore, by
116// default, for now, fall back to GeneralDynamic code generation.
118 "aarch64-elf-ldtls-generation", cl::Hidden,
119 cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
120 cl::init(false));
121
// Hidden command-line switch (default: on) gating the optimization of
// AArch64 logical-immediate instructions during lowering.
122static cl::opt<bool>
123EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
124 cl::desc("Enable AArch64 logical imm instruction "
125 "optimization"),
126 cl::init(true));
127
128// Temporary option added for the purpose of testing functionality added
129// to DAGCombiner.cpp in D92230. It is expected that this can be removed
130// in future when both implementations will be based off MGATHER rather
131// than the GLD1 nodes added for the SVE gather load intrinsics.
// Hidden command-line switch (default: on); see the temporary-option note
// above for why it exists and when it can be removed.
132static cl::opt<bool>
133EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
134 cl::desc("Combine extends of AArch64 masked "
135 "gather intrinsics"),
136 cl::init(true));
137
// Hidden command-line switch (default: on) gating the DAG combine that
// turns vector ext/trunc sequences into TBL-based lowering.
138static cl::opt<bool> EnableExtToTBL("aarch64-enable-ext-to-tbl", cl::Hidden,
139 cl::desc("Combine ext and trunc to TBL"),
140 cl::init(true));
141
142// All of the XOR, OR and CMP use ALU ports, and data dependency will become the
143// bottleneck after this transform on high end CPU. So this max leaf node
144// limit guards that the cmp+ccmp transform stays profitable.
// Hidden cap (default: 16) on the number of xor leaf nodes the combine
// described above will gather before giving up.
145static cl::opt<unsigned> MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden,
146 cl::desc("Maximum of xors"));
147
148/// Value type used for condition codes.
// Condition-code values are modeled as 32-bit integers.
149static const MVT MVT_CC = MVT::i32;
150
// General-purpose registers X0-X7, in allocation order. Presumably the
// integer argument registers of the AAPCS64 calling convention — confirm
// against the calling-convention lowering that consumes this table.
151static const MCPhysReg GPRArgRegs[] = {AArch64::X0, AArch64::X1, AArch64::X2,
152 AArch64::X3, AArch64::X4, AArch64::X5,
153 AArch64::X6, AArch64::X7};
// FP/SIMD registers Q0-Q7, in allocation order. Presumably the
// floating-point/vector argument registers of the AAPCS64 calling
// convention — confirm against the calling-convention lowering.
154static const MCPhysReg FPRArgRegs[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
155 AArch64::Q3, AArch64::Q4, AArch64::Q5,
156 AArch64::Q6, AArch64::Q7};
157
159
161
162static inline EVT getPackedSVEVectorVT(EVT VT) {
163 switch (VT.getSimpleVT().SimpleTy) {
164 default:
165 llvm_unreachable("unexpected element type for vector");
166 case MVT::i8:
167 return MVT::nxv16i8;
168 case MVT::i16:
169 return MVT::nxv8i16;
170 case MVT::i32:
171 return MVT::nxv4i32;
172 case MVT::i64:
173 return MVT::nxv2i64;
174 case MVT::f16:
175 return MVT::nxv8f16;
176 case MVT::f32:
177 return MVT::nxv4f32;
178 case MVT::f64:
179 return MVT::nxv2f64;
180 case MVT::bf16:
181 return MVT::nxv8bf16;
182 }
183}
184
185// NOTE: Currently there's only a need to return integer vector types. If this
186// changes then just add an extra "type" parameter.
188 switch (EC.getKnownMinValue()) {
189 default:
190 llvm_unreachable("unexpected element count for vector");
191 case 16:
192 return MVT::nxv16i8;
193 case 8:
194 return MVT::nxv8i16;
195 case 4:
196 return MVT::nxv4i32;
197 case 2:
198 return MVT::nxv2i64;
199 }
200}
201
203 assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) &&
204 "Expected scalable predicate vector type!");
205 switch (VT.getVectorMinNumElements()) {
206 default:
207 llvm_unreachable("unexpected element count for vector");
208 case 2:
209 return MVT::nxv2i64;
210 case 4:
211 return MVT::nxv4i32;
212 case 8:
213 return MVT::nxv8i16;
214 case 16:
215 return MVT::nxv16i8;
216 }
217}
218
219/// Returns true if VT's elements occupy the lowest bit positions of its
220/// associated register class without any intervening space.
221///
222/// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
223/// same register class, but only nxv8f16 can be treated as a packed vector.
224static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
226 "Expected legal vector type!");
227 return VT.isFixedLengthVector() ||
229}
230
231// Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading
232// predicate and end with a passthru value matching the result type.
233static bool isMergePassthruOpcode(unsigned Opc) {
234 switch (Opc) {
235 default:
236 return false;
266 return true;
267 }
268}
269
270// Returns true if inactive lanes are known to be zeroed by construction.
272 switch (Op.getOpcode()) {
273 default:
274 // We guarantee i1 splat_vectors to zero the other lanes by
275 // implementing it with ptrue and possibly a punpklo for nxv1i1.
277 return true;
278 return false;
281 return true;
283 switch (Op.getConstantOperandVal(0)) {
284 default:
285 return false;
286 case Intrinsic::aarch64_sve_ptrue:
287 case Intrinsic::aarch64_sve_pnext:
288 case Intrinsic::aarch64_sve_cmpeq:
289 case Intrinsic::aarch64_sve_cmpne:
290 case Intrinsic::aarch64_sve_cmpge:
291 case Intrinsic::aarch64_sve_cmpgt:
292 case Intrinsic::aarch64_sve_cmphs:
293 case Intrinsic::aarch64_sve_cmphi:
294 case Intrinsic::aarch64_sve_cmpeq_wide:
295 case Intrinsic::aarch64_sve_cmpne_wide:
296 case Intrinsic::aarch64_sve_cmpge_wide:
297 case Intrinsic::aarch64_sve_cmpgt_wide:
298 case Intrinsic::aarch64_sve_cmplt_wide:
299 case Intrinsic::aarch64_sve_cmple_wide:
300 case Intrinsic::aarch64_sve_cmphs_wide:
301 case Intrinsic::aarch64_sve_cmphi_wide:
302 case Intrinsic::aarch64_sve_cmplo_wide:
303 case Intrinsic::aarch64_sve_cmpls_wide:
304 case Intrinsic::aarch64_sve_fcmpeq:
305 case Intrinsic::aarch64_sve_fcmpne:
306 case Intrinsic::aarch64_sve_fcmpge:
307 case Intrinsic::aarch64_sve_fcmpgt:
308 case Intrinsic::aarch64_sve_fcmpuo:
309 case Intrinsic::aarch64_sve_facgt:
310 case Intrinsic::aarch64_sve_facge:
311 case Intrinsic::aarch64_sve_whilege:
312 case Intrinsic::aarch64_sve_whilegt:
313 case Intrinsic::aarch64_sve_whilehi:
314 case Intrinsic::aarch64_sve_whilehs:
315 case Intrinsic::aarch64_sve_whilele:
316 case Intrinsic::aarch64_sve_whilelo:
317 case Intrinsic::aarch64_sve_whilels:
318 case Intrinsic::aarch64_sve_whilelt:
319 case Intrinsic::aarch64_sve_match:
320 case Intrinsic::aarch64_sve_nmatch:
321 case Intrinsic::aarch64_sve_whilege_x2:
322 case Intrinsic::aarch64_sve_whilegt_x2:
323 case Intrinsic::aarch64_sve_whilehi_x2:
324 case Intrinsic::aarch64_sve_whilehs_x2:
325 case Intrinsic::aarch64_sve_whilele_x2:
326 case Intrinsic::aarch64_sve_whilelo_x2:
327 case Intrinsic::aarch64_sve_whilels_x2:
328 case Intrinsic::aarch64_sve_whilelt_x2:
329 return true;
330 }
331 }
332}
333
335 const AArch64Subtarget &STI)
336 : TargetLowering(TM), Subtarget(&STI) {
337 // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
338 // we have to make something up. Arbitrarily, choose ZeroOrOne.
340 // When comparing vectors the result sets the different elements in the
341 // vector to all-one or all-zero.
343
344 // Set up the register classes.
345 addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
346 addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
347
348 if (Subtarget->hasLS64()) {
349 addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
350 setOperationAction(ISD::LOAD, MVT::i64x8, Custom);
352 }
353
354 if (Subtarget->hasFPARMv8()) {
355 addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
356 addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
357 addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
358 addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
359 addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
360 }
361
362 if (Subtarget->hasNEON()) {
363 addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
364 addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
365 // Someone set us up the NEON.
366 addDRTypeForNEON(MVT::v2f32);
367 addDRTypeForNEON(MVT::v8i8);
368 addDRTypeForNEON(MVT::v4i16);
369 addDRTypeForNEON(MVT::v2i32);
370 addDRTypeForNEON(MVT::v1i64);
371 addDRTypeForNEON(MVT::v1f64);
372 addDRTypeForNEON(MVT::v4f16);
373 if (Subtarget->hasBF16())
374 addDRTypeForNEON(MVT::v4bf16);
375
376 addQRTypeForNEON(MVT::v4f32);
377 addQRTypeForNEON(MVT::v2f64);
378 addQRTypeForNEON(MVT::v16i8);
379 addQRTypeForNEON(MVT::v8i16);
380 addQRTypeForNEON(MVT::v4i32);
381 addQRTypeForNEON(MVT::v2i64);
382 addQRTypeForNEON(MVT::v8f16);
383 if (Subtarget->hasBF16())
384 addQRTypeForNEON(MVT::v8bf16);
385 }
386
387 if (Subtarget->hasSVEorSME()) {
388 // Add legal sve predicate types
389 addRegisterClass(MVT::nxv1i1, &AArch64::PPRRegClass);
390 addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
391 addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
392 addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
393 addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);
394
395 // Add legal sve data types
396 addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
397 addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
398 addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
399 addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);
400
401 addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
402 addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
403 addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
404 addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
405 addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
406 addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
407
408 if (Subtarget->hasBF16()) {
409 addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
410 addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
411 addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
412 }
413
414 if (Subtarget->useSVEForFixedLengthVectors()) {
417 addRegisterClass(VT, &AArch64::ZPRRegClass);
418
421 addRegisterClass(VT, &AArch64::ZPRRegClass);
422 }
423 }
424
425 if (Subtarget->hasSVE2p1() || Subtarget->hasSME2()) {
426 addRegisterClass(MVT::aarch64svcount, &AArch64::PPRRegClass);
427 setOperationPromotedToType(ISD::LOAD, MVT::aarch64svcount, MVT::nxv16i1);
428 setOperationPromotedToType(ISD::STORE, MVT::aarch64svcount, MVT::nxv16i1);
429
430 setOperationAction(ISD::SELECT, MVT::aarch64svcount, Custom);
431 setOperationAction(ISD::SELECT_CC, MVT::aarch64svcount, Expand);
432 }
433
434 // Compute derived properties from the register classes
436
437 // Provide all sorts of operation actions
474
478
482
484
485 // Custom lowering hooks are needed for XOR
486 // to fold it into CSINC/CSINV.
489
490 // Virtually no operation on f128 is legal, but LLVM can't expand them when
491 // there's a valid register class, so we need custom operations in most cases.
515 // FIXME: f128 FMINIMUM and FMAXIMUM (including STRICT versions) currently
516 // aren't handled.
517
518 // Lowering for many of the conversions is actually specified by the non-f128
519 // type. The LowerXXX function will be trivial when f128 isn't involved.
550
555
556 // Variable arguments.
561
562 // Variable-sized objects.
565
566 // Lowering Funnel Shifts to EXTR
571
573
574 // Constant pool entries
576
577 // BlockAddress
579
580 // AArch64 lacks both left-rotate and popcount instructions.
586 }
587
588 // AArch64 doesn't have i32 MULH{S|U}.
591
592 // AArch64 doesn't have {U|S}MUL_LOHI.
597
598 if (Subtarget->hasCSSC()) {
602
604
608
611
616
621 } else {
625
628
631 }
632
638 }
645
646 // Custom lower Add/Sub/Mul with overflow.
659
668
677 if (Subtarget->hasFullFP16())
679 else
681
682 for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI,
690 setOperationAction(Op, MVT::f16, Promote);
691 setOperationAction(Op, MVT::v4f16, Expand);
692 setOperationAction(Op, MVT::v8f16, Expand);
693 }
694
695 if (!Subtarget->hasFullFP16()) {
696 for (auto Op :
711 setOperationAction(Op, MVT::f16, Promote);
712
713 // Round-to-integer need custom lowering for fp16, as Promote doesn't work
714 // because the result type is integer.
718 setOperationAction(Op, MVT::f16, Custom);
719
720 // promote v4f16 to v4f32 when that is known to be safe.
721 setOperationPromotedToType(ISD::FADD, MVT::v4f16, MVT::v4f32);
722 setOperationPromotedToType(ISD::FSUB, MVT::v4f16, MVT::v4f32);
723 setOperationPromotedToType(ISD::FMUL, MVT::v4f16, MVT::v4f32);
724 setOperationPromotedToType(ISD::FDIV, MVT::v4f16, MVT::v4f32);
725
726 setOperationAction(ISD::FABS, MVT::v4f16, Expand);
727 setOperationAction(ISD::FNEG, MVT::v4f16, Expand);
730 setOperationAction(ISD::FMA, MVT::v4f16, Expand);
742
743 setOperationAction(ISD::FABS, MVT::v8f16, Expand);
744 setOperationAction(ISD::FADD, MVT::v8f16, Expand);
747 setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
749 setOperationAction(ISD::FMA, MVT::v8f16, Expand);
750 setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
752 setOperationAction(ISD::FNEG, MVT::v8f16, Expand);
757 setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
764 }
765
766 // AArch64 has implementations of a lot of rounding-like FP operations.
767 for (auto Op :
778 for (MVT Ty : {MVT::f32, MVT::f64})
780 if (Subtarget->hasFullFP16())
781 setOperationAction(Op, MVT::f16, Legal);
782 }
783
784 // Basic strict FP operations are legal
787 for (MVT Ty : {MVT::f32, MVT::f64})
789 if (Subtarget->hasFullFP16())
790 setOperationAction(Op, MVT::f16, Legal);
791 }
792
793 // Strict conversion to a larger type is legal
794 for (auto VT : {MVT::f32, MVT::f64})
796
798
801
803 if (!Subtarget->hasLSE() && !Subtarget->outlineAtomics()) {
806 } else {
809 }
812
813 // Generate outline atomics library calls only if LSE was not specified for
814 // subtarget
815 if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
841#define LCALLNAMES(A, B, N) \
842 setLibcallName(A##N##_RELAX, #B #N "_relax"); \
843 setLibcallName(A##N##_ACQ, #B #N "_acq"); \
844 setLibcallName(A##N##_REL, #B #N "_rel"); \
845 setLibcallName(A##N##_ACQ_REL, #B #N "_acq_rel");
846#define LCALLNAME4(A, B) \
847 LCALLNAMES(A, B, 1) \
848 LCALLNAMES(A, B, 2) LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8)
849#define LCALLNAME5(A, B) \
850 LCALLNAMES(A, B, 1) \
851 LCALLNAMES(A, B, 2) \
852 LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) LCALLNAMES(A, B, 16)
853 LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas)
854 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp)
855 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd)
856 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset)
857 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr)
858 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor)
859#undef LCALLNAMES
860#undef LCALLNAME4
861#undef LCALLNAME5
862 }
863
864 if (Subtarget->hasLSE128()) {
865 // Custom lowering because i128 is not legal. Must be replaced by 2x64
866 // values. ATOMIC_LOAD_AND also needs op legalisation to emit LDCLRP.
870 }
871
872 // 128-bit loads and stores can be done without expanding
875
876 // Aligned 128-bit loads and stores are single-copy atomic according to the
877 // v8.4a spec. LRCPC3 introduces 128-bit STILP/LDIAPP but still requires LSE2.
878 if (Subtarget->hasLSE2()) {
881 }
882
883 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
884 // custom lowering, as there are no un-paired non-temporal stores and
885 // legalization will break up 256 bit inputs.
887 setOperationAction(ISD::STORE, MVT::v16i16, Custom);
888 setOperationAction(ISD::STORE, MVT::v16f16, Custom);
893
894 // 256 bit non-temporal loads can be lowered to LDNP. This is done using
895 // custom lowering, as there are no un-paired non-temporal loads legalization
896 // will break up 256 bit inputs.
897 setOperationAction(ISD::LOAD, MVT::v32i8, Custom);
898 setOperationAction(ISD::LOAD, MVT::v16i16, Custom);
899 setOperationAction(ISD::LOAD, MVT::v16f16, Custom);
900 setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
901 setOperationAction(ISD::LOAD, MVT::v8f32, Custom);
902 setOperationAction(ISD::LOAD, MVT::v4f64, Custom);
903 setOperationAction(ISD::LOAD, MVT::v4i64, Custom);
904
905 // Lower READCYCLECOUNTER using an mrs from CNTVCT_EL0.
907
908 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
909 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
910 // Issue __sincos_stret if available.
913 } else {
916 }
917
918 if (Subtarget->getTargetTriple().isOSMSVCRT()) {
919 // MSVCRT doesn't have powi; fall back to pow
920 setLibcallName(RTLIB::POWI_F32, nullptr);
921 setLibcallName(RTLIB::POWI_F64, nullptr);
922 }
923
924 // Make floating-point constants legal for the large code model, so they don't
925 // become loads from the constant pool.
926 if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
929 }
930
931 // AArch64 does not have floating-point extending loads, i1 sign-extending
932 // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
933 for (MVT VT : MVT::fp_valuetypes()) {
934 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
935 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
936 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
937 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
938 }
939 for (MVT VT : MVT::integer_valuetypes())
941
942 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
943 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
944 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
945 setTruncStoreAction(MVT::f128, MVT::f80, Expand);
946 setTruncStoreAction(MVT::f128, MVT::f64, Expand);
947 setTruncStoreAction(MVT::f128, MVT::f32, Expand);
948 setTruncStoreAction(MVT::f128, MVT::f16, Expand);
949
953
954 // Indexed loads and stores are supported.
955 for (unsigned im = (unsigned)ISD::PRE_INC;
957 setIndexedLoadAction(im, MVT::i8, Legal);
958 setIndexedLoadAction(im, MVT::i16, Legal);
959 setIndexedLoadAction(im, MVT::i32, Legal);
960 setIndexedLoadAction(im, MVT::i64, Legal);
961 setIndexedLoadAction(im, MVT::f64, Legal);
962 setIndexedLoadAction(im, MVT::f32, Legal);
963 setIndexedLoadAction(im, MVT::f16, Legal);
964 setIndexedLoadAction(im, MVT::bf16, Legal);
965 setIndexedStoreAction(im, MVT::i8, Legal);
966 setIndexedStoreAction(im, MVT::i16, Legal);
967 setIndexedStoreAction(im, MVT::i32, Legal);
968 setIndexedStoreAction(im, MVT::i64, Legal);
969 setIndexedStoreAction(im, MVT::f64, Legal);
970 setIndexedStoreAction(im, MVT::f32, Legal);
971 setIndexedStoreAction(im, MVT::f16, Legal);
972 setIndexedStoreAction(im, MVT::bf16, Legal);
973 }
974
975 // Trap.
976 setOperationAction(ISD::TRAP, MVT::Other, Legal);
979
980 // We combine OR nodes for bitfield operations.
982 // Try to create BICs for vector ANDs.
984
985 // Vector add and sub nodes may conceal a high-half opportunity.
986 // Also, try to fold ADD into CSINC/CSINV..
989
992
993 // Try and combine setcc with csel
995
997
1004
1006
1008
1010
1014
1016
1018
1020
1022
1026
1028
1029 // In case of strict alignment, avoid an excessive number of byte wide stores.
1032 Subtarget->requiresStrictAlign() ? MaxStoresPerMemsetOptSize : 32;
1033
1037 Subtarget->requiresStrictAlign() ? MaxStoresPerMemcpyOptSize : 16;
1038
1041
1044 Subtarget->requiresStrictAlign() ? MaxLoadsPerMemcmpOptSize : 8;
1045
1047
1049
1050 EnableExtLdPromotion = true;
1051
1052 // Set required alignment.
1054 // Set preferred alignments.
1055
1056 // Don't align loops on Windows. The SEH unwind info generation needs to
1057 // know the exact length of functions before the alignments have been
1058 // expanded.
1059 if (!Subtarget->isTargetWindows())
1063
1064 // Only change the limit for entries in a jump table if specified by
1065 // the sub target, but not at the command line.
1066 unsigned MaxJT = STI.getMaximumJumpTableSize();
1067 if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
1069
1071
1073
1075
1076 if (Subtarget->hasNEON()) {
1077 // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
1078 // silliness like this:
1079 for (auto Op :
1094 setOperationAction(Op, MVT::v1f64, Expand);
1095
1096 for (auto Op :
1101 setOperationAction(Op, MVT::v1i64, Expand);
1102
1103 // AArch64 doesn't have a direct vector ->f32 conversion instructions for
1104 // elements smaller than i32, so promote the input to i32 first.
1105 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
1106 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);
1107
1108 // Similarly, there is no direct i32 -> f64 vector conversion instruction.
1109 // Or, direct i32 -> f16 vector conversion. Set it so custom, so the
1110 // conversion happens in two steps: v4i32 -> v4f32 -> v4f16
1113 for (auto VT : {MVT::v2i32, MVT::v2i64, MVT::v4i32})
1115
1116 if (Subtarget->hasFullFP16()) {
1119
1128 } else {
1129 // when AArch64 doesn't have fullfp16 support, promote the input
1130 // to i32 first.
1131 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
1132 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
1133 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i8, MVT::v16i32);
1134 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i8, MVT::v16i32);
1135 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
1136 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
1137 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
1138 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
1139 }
1140
1141 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
1142 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
1149 for (auto VT : {MVT::v1i64, MVT::v2i64}) {
1154 }
1155
1156 // Custom handling for some quad-vector types to detect MULL.
1157 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
1158 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1159 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1160 setOperationAction(ISD::MUL, MVT::v4i16, Custom);
1161 setOperationAction(ISD::MUL, MVT::v2i32, Custom);
1162 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1163
1164 // Saturates
1165 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1166 MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1171 }
1172
1173 for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
1174 MVT::v4i32}) {
1181 }
1182
1183 // Vector reductions
1184 for (MVT VT : { MVT::v4f16, MVT::v2f32,
1185 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1186 if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
1191
1193 }
1194 }
1195 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1196 MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1205 }
1210
1212 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
1213 // Likewise, narrowing and extending vector loads/stores aren't handled
1214 // directly.
1217
1218 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
1221 } else {
1224 }
1227
1230
1231 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1232 setTruncStoreAction(VT, InnerVT, Expand);
1233 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1234 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1235 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1236 }
1237 }
1238
1239 // AArch64 has implementations of a lot of rounding-like FP operations.
1240 for (auto Op :
1245 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1247 if (Subtarget->hasFullFP16())
1248 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1250 }
1251
1252 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
1253
1258
1262
1263 setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1264 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1265 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1266 setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1267 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1268 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1269
1270 // ADDP custom lowering
1271 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1273 // FADDP custom lowering
1274 for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1276 }
1277
1278 if (Subtarget->hasSME()) {
1280 }
1281
1282 // FIXME: Move lowering for more nodes here if those are common between
1283 // SVE and SME.
1284 if (Subtarget->hasSVEorSME()) {
1285 for (auto VT :
1286 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1291 }
1292 }
1293
1294 if (Subtarget->hasSVEorSME()) {
1295 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
1338
1344
1353
1358
1359 if (!Subtarget->isLittleEndian())
1361
1362 if (Subtarget->hasSVE2orSME())
1363 // For SLI/SRI.
1365 }
1366
1367 // Illegal unpacked integer vector types.
1368 for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
1371 }
1372
1373 // Legalize unpacked bitcasts to REINTERPRET_CAST.
1374 for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32, MVT::nxv2bf16,
1375 MVT::nxv4bf16, MVT::nxv2f16, MVT::nxv4f16, MVT::nxv2f32})
1377
1378 for (auto VT :
1379 { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
1380 MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
1382
1383 for (auto VT :
1384 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1392
1396
1397 // There are no legal MVT::nxv16f## based types.
1398 if (VT != MVT::nxv16i1) {
1401 }
1402 }
1403
1404 // NEON doesn't support masked loads/stores/gathers/scatters, but SVE does
1405 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
1406 MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1407 MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1412 }
1413
1414 // Firstly, exclude all scalable vector extending loads/truncating stores,
1415 // include both integer and floating scalable vector.
1417 for (MVT InnerVT : MVT::scalable_vector_valuetypes()) {
1418 setTruncStoreAction(VT, InnerVT, Expand);
1419 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1420 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1421 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1422 }
1423 }
1424
1425 // Then, selectively enable those which we directly support.
1426 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i8, Legal);
1427 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i16, Legal);
1428 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i32, Legal);
1429 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i8, Legal);
1430 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i16, Legal);
1431 setTruncStoreAction(MVT::nxv8i16, MVT::nxv8i8, Legal);
1432 for (auto Op : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1433 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i8, Legal);
1434 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i16, Legal);
1435 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i32, Legal);
1436 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i8, Legal);
1437 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i16, Legal);
1438 setLoadExtAction(Op, MVT::nxv8i16, MVT::nxv8i8, Legal);
1439 }
1440
1441 // SVE supports truncating stores of 64 and 128-bit vectors
1442 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Custom);
1443 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Custom);
1444 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Custom);
1445 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
1446 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
1447
1448 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1449 MVT::nxv4f32, MVT::nxv2f64}) {
1485 if (Subtarget->isSVEAvailable())
1490
1504
1516
1517 if (!Subtarget->isLittleEndian())
1519 }
1520
1521 for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1528
1529 if (!Subtarget->isLittleEndian())
1531 }
1532
1535
1536 // NEON doesn't support integer divides, but SVE does
1537 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1538 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1541 }
1542
1543 // NEON doesn't support 64-bit vector integer muls, but SVE does.
1544 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1545 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1546
1547 if (Subtarget->isSVEAvailable()) {
1548 // NEON doesn't support across-vector reductions, but SVE does.
1549 for (auto VT :
1550 {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v2f64})
1552 }
1553
1554 if (!Subtarget->isNeonAvailable()) {
1555 setTruncStoreAction(MVT::v2f32, MVT::v2f16, Custom);
1556 setTruncStoreAction(MVT::v4f32, MVT::v4f16, Custom);
1557 setTruncStoreAction(MVT::v8f32, MVT::v8f16, Custom);
1558 setTruncStoreAction(MVT::v1f64, MVT::v1f16, Custom);
1559 setTruncStoreAction(MVT::v2f64, MVT::v2f16, Custom);
1560 setTruncStoreAction(MVT::v4f64, MVT::v4f16, Custom);
1561 setTruncStoreAction(MVT::v1f64, MVT::v1f32, Custom);
1562 setTruncStoreAction(MVT::v2f64, MVT::v2f32, Custom);
1563 setTruncStoreAction(MVT::v4f64, MVT::v4f32, Custom);
1564 for (MVT VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1565 MVT::v4i32, MVT::v1i64, MVT::v2i64})
1566 addTypeForFixedLengthSVE(VT, /*StreamingSVE=*/ true);
1567
1568 for (MVT VT :
1569 {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v2f64})
1570 addTypeForFixedLengthSVE(VT, /*StreamingSVE=*/ true);
1571 }
1572
1573 // NOTE: Currently this has to happen after computeRegisterProperties rather
1574 // than the preferred option of combining it with the addRegisterClass call.
1575 if (Subtarget->useSVEForFixedLengthVectors()) {
1578 addTypeForFixedLengthSVE(VT, /*StreamingSVE=*/ false);
1581 addTypeForFixedLengthSVE(VT, /*StreamingSVE=*/ false);
1582
1583 // 64bit results can mean a bigger than NEON input.
1584 for (auto VT : {MVT::v8i8, MVT::v4i16})
1587
1588 // 128bit results imply a bigger than NEON input.
1589 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
1591 for (auto VT : {MVT::v8f16, MVT::v4f32})
1593
1594 // These operations are not supported on NEON but SVE can do them.
1596 setOperationAction(ISD::CTLZ, MVT::v1i64, Custom);
1597 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1598 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
1599 setOperationAction(ISD::MULHS, MVT::v1i64, Custom);
1600 setOperationAction(ISD::MULHS, MVT::v2i64, Custom);
1601 setOperationAction(ISD::MULHU, MVT::v1i64, Custom);
1602 setOperationAction(ISD::MULHU, MVT::v2i64, Custom);
1603 setOperationAction(ISD::SMAX, MVT::v1i64, Custom);
1604 setOperationAction(ISD::SMAX, MVT::v2i64, Custom);
1605 setOperationAction(ISD::SMIN, MVT::v1i64, Custom);
1606 setOperationAction(ISD::SMIN, MVT::v2i64, Custom);
1607 setOperationAction(ISD::UMAX, MVT::v1i64, Custom);
1608 setOperationAction(ISD::UMAX, MVT::v2i64, Custom);
1609 setOperationAction(ISD::UMIN, MVT::v1i64, Custom);
1610 setOperationAction(ISD::UMIN, MVT::v2i64, Custom);
1615
1616 // Int operations with no NEON support.
1617 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1618 MVT::v2i32, MVT::v4i32, MVT::v2i64}) {
1626 }
1627
1628
1629 // Use SVE for vectors with more than 2 elements.
1630 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
1632 }
1633
1634 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv2i1, MVT::nxv2i64);
1635 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv4i1, MVT::nxv4i32);
1636 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv8i1, MVT::nxv8i16);
1637 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8);
1638
1640 }
1641
1642 if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
1643 // Only required for llvm.aarch64.mops.memset.tag
1645 }
1646
1648
1649 if (Subtarget->hasSVE()) {
1653 }
1654
1655 PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
1656
1657 IsStrictFPEnabled = true;
1659
1660 if (Subtarget->isWindowsArm64EC()) {
1661 // FIXME: are there other intrinsics we need to add here?
1662 setLibcallName(RTLIB::MEMCPY, "#memcpy");
1663 setLibcallName(RTLIB::MEMSET, "#memset");
1664 setLibcallName(RTLIB::MEMMOVE, "#memmove");
1665 setLibcallName(RTLIB::REM_F32, "#fmodf");
1666 setLibcallName(RTLIB::REM_F64, "#fmod");
1667 setLibcallName(RTLIB::FMA_F32, "#fmaf");
1668 setLibcallName(RTLIB::FMA_F64, "#fma");
1669 setLibcallName(RTLIB::SQRT_F32, "#sqrtf");
1670 setLibcallName(RTLIB::SQRT_F64, "#sqrt");
1671 setLibcallName(RTLIB::CBRT_F32, "#cbrtf");
1672 setLibcallName(RTLIB::CBRT_F64, "#cbrt");
1673 setLibcallName(RTLIB::LOG_F32, "#logf");
1674 setLibcallName(RTLIB::LOG_F64, "#log");
1675 setLibcallName(RTLIB::LOG2_F32, "#log2f");
1676 setLibcallName(RTLIB::LOG2_F64, "#log2");
1677 setLibcallName(RTLIB::LOG10_F32, "#log10f");
1678 setLibcallName(RTLIB::LOG10_F64, "#log10");
1679 setLibcallName(RTLIB::EXP_F32, "#expf");
1680 setLibcallName(RTLIB::EXP_F64, "#exp");
1681 setLibcallName(RTLIB::EXP2_F32, "#exp2f");
1682 setLibcallName(RTLIB::EXP2_F64, "#exp2");
1683 setLibcallName(RTLIB::EXP10_F32, "#exp10f");
1684 setLibcallName(RTLIB::EXP10_F64, "#exp10");
1685 setLibcallName(RTLIB::SIN_F32, "#sinf");
1686 setLibcallName(RTLIB::SIN_F64, "#sin");
1687 setLibcallName(RTLIB::COS_F32, "#cosf");
1688 setLibcallName(RTLIB::COS_F64, "#cos");
1689 setLibcallName(RTLIB::POW_F32, "#powf");
1690 setLibcallName(RTLIB::POW_F64, "#pow");
1691 setLibcallName(RTLIB::LDEXP_F32, "#ldexpf");
1692 setLibcallName(RTLIB::LDEXP_F64, "#ldexp");
1693 setLibcallName(RTLIB::FREXP_F32, "#frexpf");
1694 setLibcallName(RTLIB::FREXP_F64, "#frexp");
1695 }
1696}
1697
// Configure operation legality for a NEON fixed-width vector type VT.
// Called from addDRTypeForNEON/addQRTypeForNEON once VT has a register class.
// NOTE(review): several line ranges are elided in this view; comments below
// describe only the visible statements.
1698 void AArch64TargetLowering::addTypeForNEON(MVT VT) {
1699 assert(VT.isVector() && "VT should be a vector type");
1700
  // FP vectors are loaded/stored via a promoted type (PromoteTo is computed
  // on an elided line -- presumably the same-width integer vector; verify).
1701 if (VT.isFloatingPoint()) {
1703 setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
1704 setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
1705 }
1706
1707 // Mark vector float intrinsics as expand.
1708 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
1718 }
1719
1720 // But we do support custom-lowering for FCOPYSIGN.
1721 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
1722 ((VT == MVT::v4f16 || VT == MVT::v8f16) && Subtarget->hasFullFP16()))
1724
1737
  // No extending loads into NEON vectors; force them to be expanded.
1741 for (MVT InnerVT : MVT::all_valuetypes())
1742 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1743
1744 // CNT supports only B element sizes, then use UADDLP to widen.
1745 if (VT != MVT::v8i8 && VT != MVT::v16i8)
1747
1753
1754 for (unsigned Opcode :
1757 setOperationAction(Opcode, VT, Custom);
1758
1759 if (!VT.isFloatingPoint())
1761
1762 // [SU][MIN|MAX] are available for all NEON types apart from i64.
1763 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
1764 for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
1765 setOperationAction(Opcode, VT, Legal);
1766
1767 // F[MIN|MAX][NUM|NAN] and simple strict operations are available for all FP
1768 // NEON types.
  // bf16 is excluded; f16 additionally requires the FullFP16 feature.
1769 if (VT.isFloatingPoint() &&
1770 VT.getVectorElementType() != MVT::bf16 &&
1771 (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
1772 for (unsigned Opcode :
1778 setOperationAction(Opcode, VT, Legal);
1779
1780 // Strict fp extend and trunc are legal
1781 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 16)
1783 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 64)
1785
1786 // FIXME: We could potentially make use of the vector comparison instructions
1787 // for STRICT_FSETCC and STRICT_FSETCCS, but there's a number of
1788 // complications:
1789 // * FCMPEQ/NE are quiet comparisons, the rest are signalling comparisons,
1790 // so we would need to expand when the condition code doesn't match the
1791 // kind of comparison.
1792 // * Some kinds of comparison require more than one FCMXY instruction so
1793 // would need to be expanded instead.
1794 // * The lowering of the non-strict versions involves target-specific ISD
1795 // nodes so we would likely need to add strict versions of all of them and
1796 // handle them appropriately.
1799
  // Pre-/post-indexed memory forms are only enabled on little-endian.
1800 if (Subtarget->isLittleEndian()) {
1801 for (unsigned im = (unsigned)ISD::PRE_INC;
1805 }
1806 }
1807
1808 if (Subtarget->hasD128()) {
1811 }
1812 }
1813
  // (Continuation: the signature line is elided in this view -- presumably
  // shouldExpandGetActiveLaneMask(EVT ResVT, EVT OpVT); verify upstream.)
  // Returning true asks the generic legalizer to expand the intrinsic;
  // returning false keeps it so isel can select an SVE whilelo.
1815 EVT OpVT) const {
1816 // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo).
1817 if (!Subtarget->hasSVE())
1818 return true;
1819
1820 // We can only support legal predicate result types. We can use the SVE
1821 // whilelo instruction for generating fixed-width predicates too.
1822 if (ResVT != MVT::nxv2i1 && ResVT != MVT::nxv4i1 && ResVT != MVT::nxv8i1 &&
1823 ResVT != MVT::nxv16i1 && ResVT != MVT::v2i1 && ResVT != MVT::v4i1 &&
1824 ResVT != MVT::v8i1 && ResVT != MVT::v16i1)
1825 return true;
1826
1827 // The whilelo instruction only works with i32 or i64 scalar inputs.
1828 if (OpVT != MVT::i32 && OpVT != MVT::i64)
1829 return true;
1830
  // Supported predicate result and scalar input: do not expand.
1831 return false;
1832 }
1833
  // NOTE(review): the signature line (1834) is elided in this view. The body
  // requests expansion unless the target has SVE or SME and VT is the full
  // nxv16i1 predicate type -- the only case handled natively.
1835 return !Subtarget->hasSVEorSME() || VT != MVT::nxv16i1;
1836 }
1837
// Configure a fixed-length vector type VT to be lowered via SVE.
// StreamingSVE selects the streaming-mode variant, where some operations
// stay Legal/Expand instead of being custom-lowered to scalable forms.
// NOTE(review): several line ranges are elided in this view.
1838 void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT,
1839 bool StreamingSVE) {
1840 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
1841
1842 // By default everything must be expanded.
  // Individual operations are then re-enabled below.
1843 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1845
1846 if (VT.isFloatingPoint()) {
1856 }
1857
1858 // Mark integer truncating stores/extending loads as having custom lowering
  // Walk element widths i8, i16, ... up to (but excluding) VT's own width.
1859 if (VT.isInteger()) {
1860 MVT InnerVT = VT.changeVectorElementType(MVT::i8);
1861 while (InnerVT != VT) {
1862 setTruncStoreAction(VT, InnerVT, Custom);
1863 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Custom);
1864 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Custom);
1865 InnerVT = InnerVT.changeVectorElementType(
1866 MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
1867 }
1868 }
1869
1870 // Mark floating-point truncating stores/extending loads as having custom
1871 // lowering
1872 if (VT.isFloatingPoint()) {
1873 MVT InnerVT = VT.changeVectorElementType(MVT::f16);
1874 while (InnerVT != VT) {
1875 setTruncStoreAction(VT, InnerVT, Custom);
1876 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Custom);
1877 InnerVT = InnerVT.changeVectorElementType(
1879 }
1880 }
1881
1882 // Lower fixed length vector operations to scalable equivalents.
  // In streaming mode plain loads/stores/bitcasts stay Legal, while
  // gather/scatter (no streaming support) must be expanded.
1887 setOperationAction(ISD::BITCAST, VT, StreamingSVE ? Legal : Custom);
1922 setOperationAction(ISD::LOAD, VT, StreamingSVE ? Legal : Custom);
1923 setOperationAction(ISD::MGATHER, VT, StreamingSVE ? Expand : Custom);
1925 setOperationAction(ISD::MSCATTER, VT, StreamingSVE ? Expand : Custom);
1944 setOperationAction(ISD::STORE, VT, StreamingSVE ? Legal : Custom);
1960 StreamingSVE ? Expand : Custom);
1971 }
1972
// Register VT in the 64-bit NEON (D / FPR64) register class, then set up
// its NEON operation actions.
1973 void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
1974 addRegisterClass(VT, &AArch64::FPR64RegClass);
1975 addTypeForNEON(VT);
1976 }
1977
// Register VT in the 128-bit NEON (Q / FPR128) register class, then set up
// its NEON operation actions.
1978 void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
1979 addRegisterClass(VT, &AArch64::FPR128RegClass);
1980 addTypeForNEON(VT);
1981 }
1982
  // (Continuation of getSetCCResultType; the first signature line is elided.)
  // Scalars compare to i32; scalable vectors compare to an i1 predicate
  // vector of matching element count. The fixed-vector case (line 1989)
  // is elided in this view.
1984 LLVMContext &C, EVT VT) const {
1985 if (!VT.isVector())
1986 return MVT::i32;
1987 if (VT.isScalableVector())
1988 return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount());
1990 }
1991
1992// isIntImmediate - This method tests to see if the node is a constant
1993// operand. If so Imm will receive the value.
1994static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
1995 if (const ConstantSDNode *C = dyn_cast<const ConstantSDNode>(N)) {
1996 Imm = C->getZExtValue();
1997 return true;
1998 }
1999 return false;
2000}
2001
2002// isOpcWithIntImmediate - This method tests to see if the node is a specific
2003// opcode and that it has a immediate integer right operand.
2004// If so Imm will receive the value.
2005static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
2006 uint64_t &Imm) {
2007 return N->getOpcode() == Opc &&
2008 isIntImmediate(N->getOperand(1).getNode(), Imm);
2009}
2010
// Try to rewrite the logical-immediate operand of Op (AND/OR/XOR) so that,
// after ignoring non-demanded bits, it becomes encodable as an AArch64
// bitmask immediate (bimm). On success the node is replaced via
// TLO.CombineTo and true is returned.
// NOTE(review): lines 2013 (TargetLoweringOpt parameter), 2021 (the
// already-encodable check) and 2097 (computing Enc) are elided in this view.
2011 static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
2012 const APInt &Demanded,
2014 unsigned NewOpc) {
2015 uint64_t OldImm = Imm, NewImm, Enc;
  // Mask covers the low Size bits (Size is 32 or 64).
2016 uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
2017
2018 // Return if the immediate is already all zeros, all ones, a bimm32 or a
2019 // bimm64.
2020 if (Imm == 0 || Imm == Mask ||
2022 return false;
2023
2024 unsigned EltSize = Size;
2025 uint64_t DemandedBits = Demanded.getZExtValue();
2026
2027 // Clear bits that are not demanded.
2028 Imm &= DemandedBits;
2029
2030 while (true) {
2031 // The goal here is to set the non-demanded bits in a way that minimizes
2032 // the number of switching between 0 and 1. In order to achieve this goal,
2033 // we set the non-demanded bits to the value of the preceding demanded bits.
2034 // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
2035 // non-demanded bit), we copy bit0 (1) to the least significant 'x',
2036 // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
2037 // The final result is 0b11000011.
2038 uint64_t NonDemandedBits = ~DemandedBits;
2039 uint64_t InvertedImm = ~Imm & DemandedBits;
2040 uint64_t RotatedImm =
2041 ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
2042 NonDemandedBits;
  // Sum/Carry propagate each demanded bit value across the following run
  // of non-demanded bits (ripple-carry trick).
2043 uint64_t Sum = RotatedImm + NonDemandedBits;
2044 bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
2045 uint64_t Ones = (Sum + Carry) & NonDemandedBits;
2046 NewImm = (Imm | Ones) & Mask;
2047
2048 // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
2049 // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
2050 // we halve the element size and continue the search.
2051 if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
2052 break;
2053
2054 // We cannot shrink the element size any further if it is 2-bits.
2055 if (EltSize == 2)
2056 return false;
2057
2058 EltSize /= 2;
2059 Mask >>= EltSize;
2060 uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
2061
2062 // Return if there is mismatch in any of the demanded bits of Imm and Hi.
2063 if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
2064 return false;
2065
2066 // Merge the upper and lower halves of Imm and DemandedBits.
2067 Imm |= Hi;
2068 DemandedBits |= DemandedBitsHi;
2069 }
2070
2071 ++NumOptimizedImms;
2072
2073 // Replicate the element across the register width.
2074 while (EltSize < Size) {
2075 NewImm |= NewImm << EltSize;
2076 EltSize *= 2;
2077 }
2078
2079 (void)OldImm;
2080 assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
2081 "demanded bits should never be altered");
2082 assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
2083
2084 // Create the new constant immediate node.
2085 EVT VT = Op.getValueType();
2086 SDLoc DL(Op);
2087 SDValue New;
2088
2089 // If the new constant immediate is all-zeros or all-ones, let the target
2090 // independent DAG combine optimize this node.
2091 if (NewImm == 0 || NewImm == OrigMask) {
2092 New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
2093 TLO.DAG.getConstant(NewImm, DL, VT));
2094 // Otherwise, create a machine node so that target independent DAG combine
2095 // doesn't undo this optimization.
2096 } else {
2098 SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
2099 New = SDValue(
2100 TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
2101 }
2102
2103 return TLO.CombineTo(Op, New);
2104 }
2105
  // (Continuation of targetShrinkDemandedConstant; the first signature line
  // and one early-exit check at line 2113 are elided in this view.)
  // Maps scalar AND/OR/XOR with a constant RHS to the corresponding
  // register-immediate machine opcode via optimizeLogicalImm.
2107 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
2108 TargetLoweringOpt &TLO) const {
2109 // Delay this optimization to as late as possible.
2110 if (!TLO.LegalOps)
2111 return false;
2112
2114 return false;
2115
  // Only scalar i32/i64 are handled; vectors use a different immediate form.
2116 EVT VT = Op.getValueType();
2117 if (VT.isVector())
2118 return false;
2119
2120 unsigned Size = VT.getSizeInBits();
2121 assert((Size == 32 || Size == 64) &&
2122 "i32 or i64 is expected after legalization.");
2123
2124 // Exit early if we demand all bits.
2125 if (DemandedBits.popcount() == Size)
2126 return false;
2127
2128 unsigned NewOpc;
2129 switch (Op.getOpcode()) {
2130 default:
2131 return false;
2132 case ISD::AND:
2133 NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
2134 break;
2135 case ISD::OR:
2136 NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
2137 break;
2138 case ISD::XOR:
2139 NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
2140 break;
2141 }
2142 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
2143 if (!C)
2144 return false;
2145 uint64_t Imm = C->getZExtValue();
2146 return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
2147 }
2148
2149 /// computeKnownBitsForTargetNode - Determine which of the bits specified in
2150 /// Mask are known to be either zero or one and return them Known.
  // NOTE(review): the signature's first line and several case labels
  // (e.g. MOVI, LOADgot/ADDlow, INTRINSIC_W_CHAIN) are elided in this view.
2152 const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
2153 const SelectionDAG &DAG, unsigned Depth) const {
2154 switch (Op.getOpcode()) {
2155 default:
2156 break;
  // DUP broadcasts a scalar; known bits come from the scalar operand,
  // truncated if the scalar is wider than the vector element.
2157 case AArch64ISD::DUP: {
2158 SDValue SrcOp = Op.getOperand(0);
2159 Known = DAG.computeKnownBits(SrcOp, Depth + 1);
2160 if (SrcOp.getValueSizeInBits() != Op.getScalarValueSizeInBits()) {
2161 assert(SrcOp.getValueSizeInBits() > Op.getScalarValueSizeInBits() &&
2162 "Expected DUP implicit truncation");
2163 Known = Known.trunc(Op.getScalarValueSizeInBits());
2164 }
2165 break;
2166 }
  // CSEL picks one of two values: only bits known in BOTH survive.
2167 case AArch64ISD::CSEL: {
2168 KnownBits Known2;
2169 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2170 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2171 Known = Known.intersectWith(Known2);
2172 break;
2173 }
  // BICi clears the bits of (imm << shift) from operand 0.
2174 case AArch64ISD::BICi: {
2175 // Compute the bit cleared value.
2176 uint64_t Mask =
2177 ~(Op->getConstantOperandVal(1) << Op->getConstantOperandVal(2));
2178 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2179 Known &= KnownBits::makeConstant(APInt(Known.getBitWidth(), Mask));
2180 break;
2181 }
2182 case AArch64ISD::VLSHR: {
2183 KnownBits Known2;
2184 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2185 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2186 Known = KnownBits::lshr(Known, Known2);
2187 break;
2188 }
2189 case AArch64ISD::VASHR: {
2190 KnownBits Known2;
2191 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2192 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2193 Known = KnownBits::ashr(Known, Known2);
2194 break;
2195 }
2196 case AArch64ISD::VSHL: {
2197 KnownBits Known2;
2198 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2199 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2200 Known = KnownBits::shl(Known, Known2);
2201 break;
2202 }
2203 case AArch64ISD::MOVI: {
2205 APInt(Known.getBitWidth(), Op->getConstantOperandVal(0)));
2206 break;
2207 }
2209 case AArch64ISD::ADDlow: {
2210 if (!Subtarget->isTargetILP32())
2211 break;
2212 // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
2213 Known.Zero = APInt::getHighBitsSet(64, 32);
2214 break;
2215 }
  // (Elided case label.) Result is operand 0 with the low byte's bits 1-7
  // known zero (0xFE mask) -- presumably an 8-byte-aligned quantity.
2217 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2218 Known.Zero |= APInt(Known.getBitWidth(), 0xFE);
2219 break;
2220 }
2222 Intrinsic::ID IntID =
2223 static_cast<Intrinsic::ID>(Op->getConstantOperandVal(1));
2224 switch (IntID) {
2225 default: return;
  // Exclusive loads zero-extend from the memory width.
2226 case Intrinsic::aarch64_ldaxr:
2227 case Intrinsic::aarch64_ldxr: {
2228 unsigned BitWidth = Known.getBitWidth();
2229 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
2230 unsigned MemBits = VT.getScalarSizeInBits();
2231 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
2232 return;
2233 }
2234 }
2235 break;
2236 }
2238 case ISD::INTRINSIC_VOID: {
2239 unsigned IntNo = Op.getConstantOperandVal(0);
2240 switch (IntNo) {
2241 default:
2242 break;
  // UADDLV of v8i8/v16i8 sums at most 16 bytes: the result fits in 11 or
  // 12 bits respectively, so all higher bits are known zero.
2243 case Intrinsic::aarch64_neon_uaddlv: {
2244 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2245 unsigned BitWidth = Known.getBitWidth();
2246 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2247 unsigned Bound = (VT == MVT::v8i8) ? 11 : 12;
2248 assert(BitWidth >= Bound && "Unexpected width!");
2250 Known.Zero |= Mask;
2251 }
2252 break;
2253 }
2254 case Intrinsic::aarch64_neon_umaxv:
2255 case Intrinsic::aarch64_neon_uminv: {
2256 // Figure out the datatype of the vector operand. The UMINV instruction
2257 // will zero extend the result, so we can mark as known zero all the
2258 // bits larger than the element datatype. 32-bit or larger doesn't need
2259 // this as those are legal types and will be handled by isel directly.
2260 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2261 unsigned BitWidth = Known.getBitWidth();
2262 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2263 assert(BitWidth >= 8 && "Unexpected width!");
2265 Known.Zero |= Mask;
2266 } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
2267 assert(BitWidth >= 16 && "Unexpected width!");
2269 Known.Zero |= Mask;
2270 }
2271 break;
2272 } break;
2273 }
2274 }
2275 }
2276 }
2277
  // (Continuation of ComputeNumSignBitsForTargetNode; the first signature
  // line is elided in this view.) AArch64 vector compares produce lane
  // results of all-zeros or all-ones, so every bit is a sign bit.
2279 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
2280 unsigned Depth) const {
2281 EVT VT = Op.getValueType();
2282 unsigned VTBits = VT.getScalarSizeInBits();
2283 unsigned Opcode = Op.getOpcode();
2284 switch (Opcode) {
2285 case AArch64ISD::CMEQ:
2286 case AArch64ISD::CMGE:
2287 case AArch64ISD::CMGT:
2288 case AArch64ISD::CMHI:
2289 case AArch64ISD::CMHS:
2290 case AArch64ISD::FCMEQ:
2291 case AArch64ISD::FCMGE:
2292 case AArch64ISD::FCMGT:
2293 case AArch64ISD::CMEQz:
2294 case AArch64ISD::CMGEz:
2295 case AArch64ISD::CMGTz:
2296 case AArch64ISD::CMLEz:
2297 case AArch64ISD::CMLTz:
2298 case AArch64ISD::FCMEQz:
2299 case AArch64ISD::FCMGEz:
2300 case AArch64ISD::FCMGTz:
2301 case AArch64ISD::FCMLEz:
2302 case AArch64ISD::FCMLTz:
2303 // Compares return either 0 or all-ones
2304 return VTBits;
2305 }
2306
  // Conservative default: at least one sign bit.
2307 return 1;
2308 }
2309
  // (Continuation; first signature line elided.) Shift amounts are always
  // represented as i64 on AArch64.
2311 EVT) const {
2312 return MVT::i64;
2313 }
2314
  // (Continuation of allowsMisalignedMemoryAccesses for EVT; first signature
  // line elided.) Misaligned accesses are allowed unless the subtarget
  // requires strict alignment; *Fast reports whether they are also fast.
2316 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2317 unsigned *Fast) const {
2318 if (Subtarget->requiresStrictAlign())
2319 return false;
2320
2321 if (Fast) {
2322 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2323 *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
2324 // See comments in performSTORECombine() for more details about
2325 // these conditions.
2326
2327 // Code that uses clang vector extensions can mark that it
2328 // wants unaligned accesses to be treated as fast by
2329 // underspecifying alignment to be 1 or 2.
2330 Alignment <= 2 ||
2331
2332 // Disregard v2i64. Memcpy lowering produces those and splitting
2333 // them regresses performance on micro-benchmarks and olden/bh.
2334 VT == MVT::v2i64;
2335 }
2336 return true;
2337 }
2338
2339 // Same as above but handling LLTs instead.
  // (GlobalISel path; first signature line elided in this view.)
2341 LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2342 unsigned *Fast) const {
2343 if (Subtarget->requiresStrictAlign())
2344 return false;
2345
2346 if (Fast) {
2347 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2348 *Fast = !Subtarget->isMisaligned128StoreSlow() ||
2349 Ty.getSizeInBytes() != 16 ||
2350 // See comments in performSTORECombine() for more details about
2351 // these conditions.
2352
2353 // Code that uses clang vector extensions can mark that it
2354 // wants unaligned accesses to be treated as fast by
2355 // underspecifying alignment to be 1 or 2.
2356 Alignment <= 2 ||
2357
2358 // Disregard v2i64. Memcpy lowering produces those and splitting
2359 // them regresses performance on micro-benchmarks and olden/bh.
2360 Ty == LLT::fixed_vector(2, 64);
2361 }
2362 return true;
2363 }
2364
  // Create the AArch64 FastISel instance for -O0 instruction selection.
  // (The middle signature line naming funcInfo is elided in this view.)
2365 FastISel *
2367 const TargetLibraryInfo *libInfo) const {
2368 return AArch64::createFastISel(funcInfo, libInfo);
2369 }
2370
// Return a string name for an AArch64ISD node opcode (debug printing).
// The long MAKE_CASE list (lines 2378-2692) is elided in this view.
2371 const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
2372 #define MAKE_CASE(V) \
2373 case V: \
2374 return #V;
2375 switch ((AArch64ISD::NodeType)Opcode) {
2377 break;
2693 }
2694 #undef MAKE_CASE
  // Unknown / non-target opcode.
2695 return nullptr;
2696 }
2697
  // (Continuation of EmitF128CSEL; the first signature line and a couple of
  // interior lines -- the insertion iterator at 2717 and the successor
  // transfer at 2733 -- are elided in this view.)
2700 MachineBasicBlock *MBB) const {
2701 // We materialise the F128CSEL pseudo-instruction as some control flow and a
2702 // phi node:
2703
2704 // OrigBB:
2705 // [... previous instrs leading to comparison ...]
2706 // b.ne TrueBB
2707 // b EndBB
2708 // TrueBB:
2709 // ; Fallthrough
2710 // EndBB:
2711 // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
2712
2713 MachineFunction *MF = MBB->getParent();
2714 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2715 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
2716 DebugLoc DL = MI.getDebugLoc();
2718
2719 Register DestReg = MI.getOperand(0).getReg();
2720 Register IfTrueReg = MI.getOperand(1).getReg();
2721 Register IfFalseReg = MI.getOperand(2).getReg();
2722 unsigned CondCode = MI.getOperand(3).getImm();
2723 bool NZCVKilled = MI.getOperand(4).isKill();
2724
2725 MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
2726 MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
2727 MF->insert(It, TrueBB);
2728 MF->insert(It, EndBB);
2729
2730 // Transfer rest of current basic-block to EndBB
2731 EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
2732 MBB->end());
2734
2735 BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
2736 BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
2737 MBB->addSuccessor(TrueBB);
2738 MBB->addSuccessor(EndBB);
2739
2740 // TrueBB falls through to the end.
2741 TrueBB->addSuccessor(EndBB);
2742
  // If NZCV is still live past the pseudo, it must remain live-in to the
  // new blocks so the verifier is satisfied.
2743 if (!NZCVKilled) {
2744 TrueBB->addLiveIn(AArch64::NZCV);
2745 EndBB->addLiveIn(AArch64::NZCV);
2746 }
2747
2748 BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
2749 .addReg(IfTrueReg)
2750 .addMBB(TrueBB)
2751 .addReg(IfFalseReg)
2752 .addMBB(MBB);
2753
2754 MI.eraseFromParent();
2755 return EndBB;
2756 }
2757
  // (Continuation of EmitLoweredCatchRet; first signature line and the start
  // of the assert at line 2760 are elided.) CATCHRET needs no lowering here;
  // the assert rejects SEH personalities, which never emit catchret.
2759 MachineInstr &MI, MachineBasicBlock *BB) const {
2761 BB->getParent()->getFunction().getPersonalityFn())) &&
2762 "SEH does not use catchret!");
2763 return BB;
2764 }
2765
  // (Continuation of EmitDynamicProbedAlloc; the signature start and the
  // definition of NextInst at line 2775 are elided in this view.)
  // Expands PROBED_STACKALLOC_DYN into a stack-probing loop down to the
  // target SP value in TargetReg.
2768 MachineBasicBlock *MBB) const {
2769 MachineFunction &MF = *MBB->getParent();
2770 MachineBasicBlock::iterator MBBI = MI.getIterator();
2772 const AArch64InstrInfo &TII =
2773 *MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
2774 Register TargetReg = MI.getOperand(0).getReg();
2776 TII.probedStackAlloc(MBBI, TargetReg, false);
2777
2778 MI.eraseFromParent();
  // probedStackAlloc may have split the block; resume after the probe code.
2779 return NextInst->getParent();
2780 }
2781
  // Expand an SME tile-load pseudo into the real instruction Opc, mapping
  // the pseudo's tile index (operand 0) onto the physical ZA tile register
  // BaseReg + index. (Return-type line and the MachineInstr parameter line
  // are elided in this view.)
2783 AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
2785 MachineBasicBlock *BB) const {
2786 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2787 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
2788
2789 MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
2790 MIB.add(MI.getOperand(1)); // slice index register
2791 MIB.add(MI.getOperand(2)); // slice index offset
2792 MIB.add(MI.getOperand(3)); // pg
2793 MIB.add(MI.getOperand(4)); // base
2794 MIB.add(MI.getOperand(5)); // offset
2795
2796 MI.eraseFromParent(); // The pseudo is gone now.
2797 return BB;
2798 }
2799
  // (Continuation of EmitFill; signature lines elided.) Expands the
  // LDR_ZA_PSEUDO fill into a real LDR_ZA that defines the whole ZA array.
2802 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2804 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::LDR_ZA));
2805
2806 MIB.addReg(AArch64::ZA, RegState::Define);
2807 MIB.add(MI.getOperand(0)); // Vector select register
2808 MIB.add(MI.getOperand(1)); // Vector select offset
2809 MIB.add(MI.getOperand(2)); // Base
  // Operand 1 is deliberately added twice: the memory offset must equal
  // the vector select offset for LDR_ZA.
2810 MIB.add(MI.getOperand(1)); // Offset, same as vector select offset
2811
2812 MI.eraseFromParent(); // The pseudo is gone now.
2813 return BB;
2814 }
2815
  // (Continuation of EmitZTInstr; signature start elided.) Expands a ZT0
  // pseudo into Opcode, with operand 0 optionally re-added as a def
  // (Op0IsDef) and all remaining operands copied through.
2818 unsigned Opcode,
2819 bool Op0IsDef) const {
2820 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2822
2823 MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opcode))
2824 .addReg(MI.getOperand(0).getReg(), Op0IsDef ? RegState::Define : 0);
2825 for (unsigned I = 1; I < MI.getNumOperands(); ++I)
2826 MIB.add(MI.getOperand(I));
2827
2828 MI.eraseFromParent(); // The pseudo is gone now.
2829 return BB;
2830 }
2831
  // Expand an SME ZA pseudo into the real instruction Opc. For tiled forms
  // (HasTile) operand 0 selects the tile, which is added as both def and
  // use (read-modify-write); otherwise the whole ZA register is tied.
  // (Return-type line and the MachineInstr parameter line are elided.)
2833 AArch64TargetLowering::EmitZAInstr(unsigned Opc, unsigned BaseReg,
2835 MachineBasicBlock *BB, bool HasTile) const {
2836 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2837 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
2838 unsigned StartIdx = 0;
2839
2840 if (HasTile) {
2841 MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
2842 MIB.addReg(BaseReg + MI.getOperand(0).getImm());
2843 StartIdx = 1;
2844 } else
2845 MIB.addReg(BaseReg, RegState::Define).addReg(BaseReg);
2846
2847 for (unsigned I = StartIdx; I < MI.getNumOperands(); ++I)
2848 MIB.add(MI.getOperand(I));
2849
2850 MI.eraseFromParent(); // The pseudo is gone now.
2851 return BB;
2852 }
2853
  // (Continuation of EmitZero; signature lines elided.) Expands
  // ZERO_M_PSEUDO into ZERO_M, adding implicit defs for each ZA double-word
  // tile (ZAD0..ZAD7) selected by the 8-bit mask.
2856 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2858 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::ZERO_M));
2859 MIB.add(MI.getOperand(0)); // Mask
2860
2861 unsigned Mask = MI.getOperand(0).getImm();
2862 for (unsigned I = 0; I < 8; I++) {
2863 if (Mask & (1 << I))
2864 MIB.addDef(AArch64::ZAD0 + I, RegState::ImplicitDefine);
2865 }
2866
2867 MI.eraseFromParent(); // The pseudo is gone now.
2868 return BB;
2869 }
2870
  // (Continuation of EmitInstrWithCustomInserter; first signature line and
  // the SME matrix-type case labels at lines 2880-2890 are elided.)
  // Central dispatcher for pseudo-instructions that need custom MBB-level
  // expansion after instruction selection.
2872 MachineInstr &MI, MachineBasicBlock *BB) const {
2873
  // SME pseudos carry their matrix type in TSFlags; dispatch on it first.
2874 int SMEOrigInstr = AArch64::getSMEPseudoMap(MI.getOpcode());
2875 if (SMEOrigInstr != -1) {
2876 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2877 uint64_t SMEMatrixType =
2878 TII->get(MI.getOpcode()).TSFlags & AArch64::SMEMatrixTypeMask;
2879 switch (SMEMatrixType) {
2881 return EmitZAInstr(SMEOrigInstr, AArch64::ZA, MI, BB, /*HasTile*/ false);
2883 return EmitZAInstr(SMEOrigInstr, AArch64::ZAB0, MI, BB, /*HasTile*/ true);
2885 return EmitZAInstr(SMEOrigInstr, AArch64::ZAH0, MI, BB, /*HasTile*/ true);
2887 return EmitZAInstr(SMEOrigInstr, AArch64::ZAS0, MI, BB, /*HasTile*/ true);
2889 return EmitZAInstr(SMEOrigInstr, AArch64::ZAD0, MI, BB, /*HasTile*/ true);
2891 return EmitZAInstr(SMEOrigInstr, AArch64::ZAQ0, MI, BB, /*HasTile*/ true);
2892 }
2893 }
2894
2895 switch (MI.getOpcode()) {
2896 default:
2897 #ifndef NDEBUG
2898 MI.dump();
2899 #endif
2900 llvm_unreachable("Unexpected instruction for custom inserter!");
2901
2902 case AArch64::F128CSEL:
2903 return EmitF128CSEL(MI, BB);
2904 case TargetOpcode::STATEPOINT:
2905 // STATEPOINT is a pseudo instruction which has no implicit defs/uses
2906 // while bl call instruction (where statepoint will be lowered at the end)
2907 // has implicit def. This def is early-clobber as it will be set at
2908 // the moment of the call and earlier than any use is read.
2909 // Add this implicit dead def here as a workaround.
2910 MI.addOperand(*MI.getMF(),
2912 AArch64::LR, /*isDef*/ true,
2913 /*isImp*/ true, /*isKill*/ false, /*isDead*/ true,
2914 /*isUndef*/ false, /*isEarlyClobber*/ true));
2915 [[fallthrough]];
2916 case TargetOpcode::STACKMAP:
2917 case TargetOpcode::PATCHPOINT:
2918 return emitPatchPoint(MI, BB);
2919
2920 case TargetOpcode::PATCHABLE_EVENT_CALL:
2921 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
2922 return BB;
2923
2924 case AArch64::CATCHRET:
2925 return EmitLoweredCatchRet(MI, BB);
2926
2927 case AArch64::PROBED_STACKALLOC_DYN:
2928 return EmitDynamicProbedAlloc(MI, BB);
2929
  // SME tile loads: horizontal (H) and vertical (V) forms per element size.
2930 case AArch64::LD1_MXIPXX_H_PSEUDO_B:
2931 return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
2932 case AArch64::LD1_MXIPXX_H_PSEUDO_H:
2933 return EmitTileLoad(AArch64::LD1_MXIPXX_H_H, AArch64::ZAH0, MI, BB);
2934 case AArch64::LD1_MXIPXX_H_PSEUDO_S:
2935 return EmitTileLoad(AArch64::LD1_MXIPXX_H_S, AArch64::ZAS0, MI, BB);
2936 case AArch64::LD1_MXIPXX_H_PSEUDO_D:
2937 return EmitTileLoad(AArch64::LD1_MXIPXX_H_D, AArch64::ZAD0, MI, BB);
2938 case AArch64::LD1_MXIPXX_H_PSEUDO_Q:
2939 return EmitTileLoad(AArch64::LD1_MXIPXX_H_Q, AArch64::ZAQ0, MI, BB);
2940 case AArch64::LD1_MXIPXX_V_PSEUDO_B:
2941 return EmitTileLoad(AArch64::LD1_MXIPXX_V_B, AArch64::ZAB0, MI, BB);
2942 case AArch64::LD1_MXIPXX_V_PSEUDO_H:
2943 return EmitTileLoad(AArch64::LD1_MXIPXX_V_H, AArch64::ZAH0, MI, BB);
2944 case AArch64::LD1_MXIPXX_V_PSEUDO_S:
2945 return EmitTileLoad(AArch64::LD1_MXIPXX_V_S, AArch64::ZAS0, MI, BB);
2946 case AArch64::LD1_MXIPXX_V_PSEUDO_D:
2947 return EmitTileLoad(AArch64::LD1_MXIPXX_V_D, AArch64::ZAD0, MI, BB);
2948 case AArch64::LD1_MXIPXX_V_PSEUDO_Q:
2949 return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB);
2950 case AArch64::LDR_ZA_PSEUDO:
2951 return EmitFill(MI, BB);
2952 case AArch64::LDR_TX_PSEUDO:
2953 return EmitZTInstr(MI, BB, AArch64::LDR_TX, /*Op0IsDef=*/true);
2954 case AArch64::STR_TX_PSEUDO:
2955 return EmitZTInstr(MI, BB, AArch64::STR_TX, /*Op0IsDef=*/false);
2956 case AArch64::ZERO_M_PSEUDO:
2957 return EmitZero(MI, BB);
2958 case AArch64::ZERO_T_PSEUDO:
2959 return EmitZTInstr(MI, BB, AArch64::ZERO_T, /*Op0IsDef=*/true);
2960 }
2961 }
2962
2963//===----------------------------------------------------------------------===//
2964// AArch64 Lowering private implementation.
2965//===----------------------------------------------------------------------===//
2966
2967//===----------------------------------------------------------------------===//
2968// Lowering Code
2969//===----------------------------------------------------------------------===//
2970
2971// Forward declarations of SVE fixed length lowering helpers
2976 SelectionDAG &DAG);
2978 EVT VT);
2979
2980 /// isZerosVector - Check whether SDNode N is a zero-filled vector.
  // NOTE(review): an additional check at line 2986 is elided in this view --
  // presumably the generic all-zeros-splat test; verify upstream.
2981 static bool isZerosVector(const SDNode *N) {
2982 // Look through a bit convert.
2983 while (N->getOpcode() == ISD::BITCAST)
2984 N = N->getOperand(0).getNode();
2985
2987 return true;
2988
  // Otherwise only a DUP of a zero scalar (int or FP) qualifies.
2989 if (N->getOpcode() != AArch64ISD::DUP)
2990 return false;
2991
2992 auto Opnd0 = N->getOperand(0);
2993 return isNullConstant(Opnd0) || isNullFPConstant(Opnd0);
2994 }
2995
2996 /// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
2997 /// CC
  // (The signature line at 2998 is elided in this view.)
2999 switch (CC) {
3000 default:
3001 llvm_unreachable("Unknown condition code!");
3002 case ISD::SETNE:
3003 return AArch64CC::NE;
3004 case ISD::SETEQ:
3005 return AArch64CC::EQ;
3006 case ISD::SETGT:
3007 return AArch64CC::GT;
3008 case ISD::SETGE:
3009 return AArch64CC::GE;
3010 case ISD::SETLT:
3011 return AArch64CC::LT;
3012 case ISD::SETLE:
3013 return AArch64CC::LE;
  // Unsigned comparisons map to the carry-based condition codes.
3014 case ISD::SETUGT:
3015 return AArch64CC::HI;
3016 case ISD::SETUGE:
3017 return AArch64CC::HS;
3018 case ISD::SETULT:
3019 return AArch64CC::LO;
3020 case ISD::SETULE:
3021 return AArch64CC::LS;
3022 }
3023 }
3024
3025 /// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
  // (The signature line at 3026 is elided in this view.) Some FP conditions
  // need two AArch64 conditions OR'ed together; CondCode2 is AL when a
  // single condition suffices.
3027 AArch64CC::CondCode &CondCode,
3028 AArch64CC::CondCode &CondCode2) {
3029 CondCode2 = AArch64CC::AL;
3030 switch (CC) {
3031 default:
3032 llvm_unreachable("Unknown FP condition!");
3033 case ISD::SETEQ:
3034 case ISD::SETOEQ:
3035 CondCode = AArch64CC::EQ;
3036 break;
3037 case ISD::SETGT:
3038 case ISD::SETOGT:
3039 CondCode = AArch64CC::GT;
3040 break;
3041 case ISD::SETGE:
3042 case ISD::SETOGE:
3043 CondCode = AArch64CC::GE;
3044 break;
  // Ordered less-than/less-equal use MI/LS, which are false on unordered.
3045 case ISD::SETOLT:
3046 CondCode = AArch64CC::MI;
3047 break;
3048 case ISD::SETOLE:
3049 CondCode = AArch64CC::LS;
3050 break;
  // one == olt || ogt
3051 case ISD::SETONE:
3052 CondCode = AArch64CC::MI;
3053 CondCode2 = AArch64CC::GT;
3054 break;
3055 case ISD::SETO:
3056 CondCode = AArch64CC::VC;
3057 break;
3058 case ISD::SETUO:
3059 CondCode = AArch64CC::VS;
3060 break;
  // ueq == oeq || uno
3061 case ISD::SETUEQ:
3062 CondCode = AArch64CC::EQ;
3063 CondCode2 = AArch64CC::VS;
3064 break;
3065 case ISD::SETUGT:
3066 CondCode = AArch64CC::HI;
3067 break;
3068 case ISD::SETUGE:
3069 CondCode = AArch64CC::PL;
3070 break;
3071 case ISD::SETLT:
3072 case ISD::SETULT:
3073 CondCode = AArch64CC::LT;
3074 break;
3075 case ISD::SETLE:
3076 case ISD::SETULE:
3077 CondCode = AArch64CC::LE;
3078 break;
3079 case ISD::SETNE:
3080 case ISD::SETUNE:
3081 CondCode = AArch64CC::NE;
3082 break;
3083 }
3084 }
3085
3086/// Convert a DAG fp condition code to an AArch64 CC.
3087/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
3088/// should be AND'ed instead of OR'ed.
3090 AArch64CC::CondCode &CondCode,
3091 AArch64CC::CondCode &CondCode2) {
3092 CondCode2 = AArch64CC::AL;
3093 switch (CC) {
3094 default:
3095 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3096 assert(CondCode2 == AArch64CC::AL);
3097 break;
3098 case ISD::SETONE:
3099 // (a one b)
3100 // == ((a olt b) || (a ogt b))
3101 // == ((a ord b) && (a une b))
3102 CondCode = AArch64CC::VC;
3103 CondCode2 = AArch64CC::NE;
3104 break;
3105 case ISD::SETUEQ:
3106 // (a ueq b)
3107 // == ((a uno b) || (a oeq b))
3108 // == ((a ule b) && (a uge b))
3109 CondCode = AArch64CC::PL;
3110 CondCode2 = AArch64CC::LE;
3111 break;
3112 }
3113}
3114
3115/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
3116/// CC usable with the vector instructions. Fewer operations are available
3117/// without a real NZCV register, so we have to use less efficient combinations
3118/// to get the same effect.
3120 AArch64CC::CondCode &CondCode,
3121 AArch64CC::CondCode &CondCode2,
3122 bool &Invert) {
3123 Invert = false;
3124 switch (CC) {
3125 default:
3126 // Mostly the scalar mappings work fine.
3127 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3128 break;
3129 case ISD::SETUO:
3130 Invert = true;
3131 [[fallthrough]];
3132 case ISD::SETO:
3133 CondCode = AArch64CC::MI;
3134 CondCode2 = AArch64CC::GE;
3135 break;
3136 case ISD::SETUEQ:
3137 case ISD::SETULT:
3138 case ISD::SETULE:
3139 case ISD::SETUGT:
3140 case ISD::SETUGE:
3141 // All of the compare-mask comparisons are ordered, but we can switch
3142 // between the two by a double inversion. E.g. ULE == !OGT.
3143 Invert = true;
3144 changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32),
3145 CondCode, CondCode2);
3146 break;
3147 }
3148}
3149
3151 // Matches AArch64DAGToDAGISel::SelectArithImmed().
3152 bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
3153 LLVM_DEBUG(dbgs() << "Is imm " << C
3154 << " legal: " << (IsLegal ? "yes\n" : "no\n"));
3155 return IsLegal;
3156}
3157
3158// Can a (CMP op1, (sub 0, op2) be turned into a CMN instruction on
3159// the grounds that "op1 - (-op2) == op1 + op2" ? Not always, the C and V flags
3160// can be set differently by this operation. It comes down to whether
3161// "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
3162// everything is fine. If not then the optimization is wrong. Thus general
3163// comparisons are only valid if op2 != 0.
3164//
3165// So, finally, the only LLVM-native comparisons that don't mention C and V
3166// are SETEQ and SETNE. They're the only ones we can safely use CMN for in
3167// the absence of information about op2.
3169 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
3170 (CC == ISD::SETEQ || CC == ISD::SETNE);
3171}
3172
3174 SelectionDAG &DAG, SDValue Chain,
3175 bool IsSignaling) {
3176 EVT VT = LHS.getValueType();
3177 assert(VT != MVT::f128);
3178
3179 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3180
3181 if (VT == MVT::f16 && !FullFP16) {
3182 LHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
3183 {Chain, LHS});
3184 RHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
3185 {LHS.getValue(1), RHS});
3186 Chain = RHS.getValue(1);
3187 VT = MVT::f32;
3188 }
3189 unsigned Opcode =
3191 return DAG.getNode(Opcode, dl, {VT, MVT::Other}, {Chain, LHS, RHS});
3192}
3193
3195 const SDLoc &dl, SelectionDAG &DAG) {
3196 EVT VT = LHS.getValueType();
3197 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3198
3199 if (VT.isFloatingPoint()) {
3200 assert(VT != MVT::f128);
3201 if (VT == MVT::f16 && !FullFP16) {
3202 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
3203 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
3204 VT = MVT::f32;
3205 }
3206 return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
3207 }
3208
3209 // The CMP instruction is just an alias for SUBS, and representing it as
3210 // SUBS means that it's possible to get CSE with subtract operations.
3211 // A later phase can perform the optimization of setting the destination
3212 // register to WZR/XZR if it ends up being unused.
3213 unsigned Opcode = AArch64ISD::SUBS;
3214
3215 if (isCMN(RHS, CC)) {
3216 // Can we combine a (CMP op1, (sub 0, op2) into a CMN instruction ?
3217 Opcode = AArch64ISD::ADDS;
3218 RHS = RHS.getOperand(1);
3219 } else if (isCMN(LHS, CC)) {
3220 // As we are looking for EQ/NE compares, the operands can be commuted ; can
3221 // we combine a (CMP (sub 0, op1), op2) into a CMN instruction ?
3222 Opcode = AArch64ISD::ADDS;
3223 LHS = LHS.getOperand(1);
3224 } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
3225 if (LHS.getOpcode() == ISD::AND) {
3226 // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
3227 // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
3228 // of the signed comparisons.
3229 const SDValue ANDSNode = DAG.getNode(AArch64ISD::ANDS, dl,
3230 DAG.getVTList(VT, MVT_CC),
3231 LHS.getOperand(0),
3232 LHS.getOperand(1));
3233 // Replace all users of (and X, Y) with newly generated (ands X, Y)
3234 DAG.ReplaceAllUsesWith(LHS, ANDSNode);
3235 return ANDSNode.getValue(1);
3236 } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
3237 // Use result of ANDS
3238 return LHS.getValue(1);
3239 }
3240 }
3241
3242 return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
3243 .getValue(1);
3244}
3245
3246/// \defgroup AArch64CCMP CMP;CCMP matching
3247///
3248/// These functions deal with the formation of CMP;CCMP;... sequences.
3249/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
3250/// a comparison. They set the NZCV flags to a predefined value if their
3251/// predicate is false. This allows to express arbitrary conjunctions, for
3252/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
3253/// expressed as:
3254/// cmp A
3255/// ccmp B, inv(CB), CA
3256/// check for CB flags
3257///
3258/// This naturally lets us implement chains of AND operations with SETCC
3259/// operands. And we can even implement some other situations by transforming
3260/// them:
3261/// - We can implement (NEG SETCC) i.e. negating a single comparison by
3262/// negating the flags used in a CCMP/FCCMP operations.
3263/// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
3264/// by negating the flags we test for afterwards. i.e.
3265/// NEG (CMP CCMP CCCMP ...) can be implemented.
3266/// - Note that we can only ever negate all previously processed results.
3267/// What we can not implement by flipping the flags to test is a negation
3268/// of two sub-trees (because the negation affects all sub-trees emitted so
3269/// far, so the 2nd sub-tree we emit would also affect the first).
3270/// With those tools we can implement some OR operations:
3271/// - (OR (SETCC A) (SETCC B)) can be implemented via:
3272/// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
3273/// - After transforming OR to NEG/AND combinations we may be able to use NEG
3274/// elimination rules from earlier to implement the whole thing as a
3275/// CCMP/FCCMP chain.
3276///
3277/// As complete example:
3278/// or (or (setCA (cmp A)) (setCB (cmp B)))
3279/// (and (setCC (cmp C)) (setCD (cmp D)))"
3280/// can be reassociated to:
///    or (and (setCC (cmp C)) (setCD (cmp D)))
///       (or (setCA (cmp A)) (setCB (cmp B)))
3283/// can be transformed to:
3284/// not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
3285/// (and (not (setCA (cmp A)) (not (setCB (cmp B))))))"
3286/// which can be implemented as:
3287/// cmp C
3288/// ccmp D, inv(CD), CC
3289/// ccmp A, CA, inv(CD)
3290/// ccmp B, CB, inv(CA)
3291/// check for CB flags
3292///
3293/// A counterexample is "or (and A B) (and C D)" which translates to
3294/// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))), we
3295/// can only implement 1 of the inner (not) operations, but not both!
3296/// @{
3297
3298/// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
3300 ISD::CondCode CC, SDValue CCOp,
3301 AArch64CC::CondCode Predicate,
3302 AArch64CC::CondCode OutCC,
3303 const SDLoc &DL, SelectionDAG &DAG) {
3304 unsigned Opcode = 0;
3305 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3306
3307 if (LHS.getValueType().isFloatingPoint()) {
3308 assert(LHS.getValueType() != MVT::f128);
3309 if (LHS.getValueType() == MVT::f16 && !FullFP16) {
3310 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
3311 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
3312 }
3313 Opcode = AArch64ISD::FCCMP;
3314 } else if (RHS.getOpcode() == ISD::SUB) {
3315 SDValue SubOp0 = RHS.getOperand(0);
3316 if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
3317 // See emitComparison() on why we can only do this for SETEQ and SETNE.
3318 Opcode = AArch64ISD::CCMN;
3319 RHS = RHS.getOperand(1);
3320 }
3321 }
3322 if (Opcode == 0)
3323 Opcode = AArch64ISD::CCMP;
3324
3325 SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
3327 unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
3328 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
3329 return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
3330}
3331
3332/// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
3333/// expressed as a conjunction. See \ref AArch64CCMP.
3334/// \param CanNegate Set to true if we can negate the whole sub-tree just by
3335/// changing the conditions on the SETCC tests.
3336/// (this means we can call emitConjunctionRec() with
3337/// Negate==true on this sub-tree)
3338/// \param MustBeFirst Set to true if this subtree needs to be negated and we
3339/// cannot do the negation naturally. We are required to
3340/// emit the subtree first in this case.
3341/// \param WillNegate Is true if are called when the result of this
3342/// subexpression must be negated. This happens when the
3343/// outer expression is an OR. We can use this fact to know
3344/// that we have a double negation (or (or ...) ...) that
3345/// can be implemented for free.
3346static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
3347 bool &MustBeFirst, bool WillNegate,
3348 unsigned Depth = 0) {
3349 if (!Val.hasOneUse())
3350 return false;
3351 unsigned Opcode = Val->getOpcode();
3352 if (Opcode == ISD::SETCC) {
3353 if (Val->getOperand(0).getValueType() == MVT::f128)
3354 return false;
3355 CanNegate = true;
3356 MustBeFirst = false;
3357 return true;
3358 }
3359 // Protect against exponential runtime and stack overflow.
3360 if (Depth > 6)
3361 return false;
3362 if (Opcode == ISD::AND || Opcode == ISD::OR) {
3363 bool IsOR = Opcode == ISD::OR;
3364 SDValue O0 = Val->getOperand(0);
3365 SDValue O1 = Val->getOperand(1);
3366 bool CanNegateL;
3367 bool MustBeFirstL;
3368 if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1))
3369 return false;
3370 bool CanNegateR;
3371 bool MustBeFirstR;
3372 if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1))
3373 return false;
3374
3375 if (MustBeFirstL && MustBeFirstR)
3376 return false;
3377
3378 if (IsOR) {
3379 // For an OR expression we need to be able to naturally negate at least
3380 // one side or we cannot do the transformation at all.
3381 if (!CanNegateL && !CanNegateR)
3382 return false;
3383 // If we the result of the OR will be negated and we can naturally negate
3384 // the leafs, then this sub-tree as a whole negates naturally.
3385 CanNegate = WillNegate && CanNegateL && CanNegateR;
3386 // If we cannot naturally negate the whole sub-tree, then this must be
3387 // emitted first.
3388 MustBeFirst = !CanNegate;
3389 } else {
3390 assert(Opcode == ISD::AND && "Must be OR or AND");
3391 // We cannot naturally negate an AND operation.
3392 CanNegate = false;
3393 MustBeFirst = MustBeFirstL || MustBeFirstR;
3394 }
3395 return true;
3396 }
3397 return false;
3398}
3399
3400/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
3401/// of CCMP/CFCMP ops. See @ref AArch64CCMP.
3402/// Tries to transform the given i1 producing node @p Val to a series compare
3403/// and conditional compare operations. @returns an NZCV flags producing node
3404/// and sets @p OutCC to the flags that should be tested or returns SDValue() if
3405/// transformation was not possible.
3406/// \p Negate is true if we want this sub-tree being negated just by changing
3407/// SETCC conditions.
3409 AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
3410 AArch64CC::CondCode Predicate) {
3411 // We're at a tree leaf, produce a conditional comparison operation.
3412 unsigned Opcode = Val->getOpcode();
3413 if (Opcode == ISD::SETCC) {
3414 SDValue LHS = Val->getOperand(0);
3415 SDValue RHS = Val->getOperand(1);
3416 ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
3417 bool isInteger = LHS.getValueType().isInteger();
3418 if (Negate)
3419 CC = getSetCCInverse(CC, LHS.getValueType());
3420 SDLoc DL(Val);
3421 // Determine OutCC and handle FP special case.
3422 if (isInteger) {
3423 OutCC = changeIntCCToAArch64CC(CC);
3424 } else {
3425 assert(LHS.getValueType().isFloatingPoint());
3426 AArch64CC::CondCode ExtraCC;
3427 changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
3428 // Some floating point conditions can't be tested with a single condition
3429 // code. Construct an additional comparison in this case.
3430 if (ExtraCC != AArch64CC::AL) {
3431 SDValue ExtraCmp;
3432 if (!CCOp.getNode())
3433 ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
3434 else
3435 ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
3436 ExtraCC, DL, DAG);
3437 CCOp = ExtraCmp;
3438 Predicate = ExtraCC;
3439 }
3440 }
3441
3442 // Produce a normal comparison if we are first in the chain
3443 if (!CCOp)
3444 return emitComparison(LHS, RHS, CC, DL, DAG);
3445 // Otherwise produce a ccmp.
3446 return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
3447 DAG);
3448 }
3449 assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
3450
3451 bool IsOR = Opcode == ISD::OR;
3452
3453 SDValue LHS = Val->getOperand(0);
3454 bool CanNegateL;
3455 bool MustBeFirstL;
3456 bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR);
3457 assert(ValidL && "Valid conjunction/disjunction tree");
3458 (void)ValidL;
3459
3460 SDValue RHS = Val->getOperand(1);
3461 bool CanNegateR;
3462 bool MustBeFirstR;
3463 bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR);
3464 assert(ValidR && "Valid conjunction/disjunction tree");
3465 (void)ValidR;
3466
3467 // Swap sub-tree that must come first to the right side.
3468 if (MustBeFirstL) {
3469 assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
3470 std::swap(LHS, RHS);
3471 std::swap(CanNegateL, CanNegateR);
3472 std::swap(MustBeFirstL, MustBeFirstR);
3473 }
3474
3475 bool NegateR;
3476 bool NegateAfterR;
3477 bool NegateL;
3478 bool NegateAfterAll;
3479 if (Opcode == ISD::OR) {
3480 // Swap the sub-tree that we can negate naturally to the left.
3481 if (!CanNegateL) {
3482 assert(CanNegateR && "at least one side must be negatable");
3483 assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
3484 assert(!Negate);
3485 std::swap(LHS, RHS);
3486 NegateR = false;
3487 NegateAfterR = true;
3488 } else {
3489 // Negate the left sub-tree if possible, otherwise negate the result.
3490 NegateR = CanNegateR;
3491 NegateAfterR = !CanNegateR;
3492 }
3493 NegateL = true;
3494 NegateAfterAll = !Negate;
3495 } else {
3496 assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
3497 assert(!Negate && "Valid conjunction/disjunction tree");
3498
3499 NegateL = false;
3500 NegateR = false;
3501 NegateAfterR = false;
3502 NegateAfterAll = false;
3503 }
3504
3505 // Emit sub-trees.
3506 AArch64CC::CondCode RHSCC;
3507 SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
3508 if (NegateAfterR)
3509 RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
3510 SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
3511 if (NegateAfterAll)
3512 OutCC = AArch64CC::getInvertedCondCode(OutCC);
3513 return CmpL;
3514}
3515
3516/// Emit expression as a conjunction (a series of CCMP/CFCMP ops).
3517/// In some cases this is even possible with OR operations in the expression.
3518/// See \ref AArch64CCMP.
3519/// \see emitConjunctionRec().
3521 AArch64CC::CondCode &OutCC) {
3522 bool DummyCanNegate;
3523 bool DummyMustBeFirst;
3524 if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false))
3525 return SDValue();
3526
3527 return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
3528}
3529
3530/// @}
3531
3532/// Returns how profitable it is to fold a comparison's operand's shift and/or
3533/// extension operations.
3535 auto isSupportedExtend = [&](SDValue V) {
3536 if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
3537 return true;
3538
3539 if (V.getOpcode() == ISD::AND)
3540 if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
3541 uint64_t Mask = MaskCst->getZExtValue();
3542 return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
3543 }
3544
3545 return false;
3546 };
3547
3548 if (!Op.hasOneUse())
3549 return 0;
3550
3551 if (isSupportedExtend(Op))
3552 return 1;
3553
3554 unsigned Opc = Op.getOpcode();
3555 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
3556 if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
3557 uint64_t Shift = ShiftCst->getZExtValue();
3558 if (isSupportedExtend(Op.getOperand(0)))
3559 return (Shift <= 4) ? 2 : 1;
3560 EVT VT = Op.getValueType();
3561 if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
3562 return 1;
3563 }
3564
3565 return 0;
3566}
3567
3569 SDValue &AArch64cc, SelectionDAG &DAG,
3570 const SDLoc &dl) {
3571 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
3572 EVT VT = RHS.getValueType();
3573 uint64_t C = RHSC->getZExtValue();
3574 if (!isLegalArithImmed(C)) {
3575 // Constant does not fit, try adjusting it by one?
3576 switch (CC) {
3577 default:
3578 break;
3579 case ISD::SETLT:
3580 case ISD::SETGE:
3581 if ((VT == MVT::i32 && C != 0x80000000 &&
3582 isLegalArithImmed((uint32_t)(C - 1))) ||
3583 (VT == MVT::i64 && C != 0x80000000ULL &&
3584 isLegalArithImmed(C - 1ULL))) {
3586 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
3587 RHS = DAG.getConstant(C, dl, VT);
3588 }
3589 break;
3590 case ISD::SETULT:
3591 case ISD::SETUGE:
3592 if ((VT == MVT::i32 && C != 0 &&
3593 isLegalArithImmed((uint32_t)(C - 1))) ||
3594 (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
3596 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
3597 RHS = DAG.getConstant(C, dl, VT);
3598 }
3599 break;
3600 case ISD::SETLE:
3601 case ISD::SETGT:
3602 if ((VT == MVT::i32 && C != INT32_MAX &&
3603 isLegalArithImmed((uint32_t)(C + 1))) ||
3604 (VT == MVT::i64 && C != INT64_MAX &&
3605 isLegalArithImmed(C + 1ULL))) {
3607 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
3608 RHS = DAG.getConstant(C, dl, VT);
3609 }
3610 break;
3611 case ISD::SETULE:
3612 case ISD::SETUGT:
3613 if ((VT == MVT::i32 && C != UINT32_MAX &&
3614 isLegalArithImmed((uint32_t)(C + 1))) ||
3615 (VT == MVT::i64 && C != UINT64_MAX &&
3616 isLegalArithImmed(C + 1ULL))) {
3618 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
3619 RHS = DAG.getConstant(C, dl, VT);
3620 }
3621 break;
3622 }
3623 }
3624 }
3625
3626 // Comparisons are canonicalized so that the RHS operand is simpler than the
3627 // LHS one, the extreme case being when RHS is an immediate. However, AArch64
3628 // can fold some shift+extend operations on the RHS operand, so swap the
3629 // operands if that can be done.
3630 //
3631 // For example:
3632 // lsl w13, w11, #1
3633 // cmp w13, w12
3634 // can be turned into:
3635 // cmp w12, w11, lsl #1
3636 if (!isa<ConstantSDNode>(RHS) || !isLegalArithImmed(RHS->getAsZExtVal())) {
3637 SDValue TheLHS = isCMN(LHS, CC) ? LHS.getOperand(1) : LHS;
3638
3640 std::swap(LHS, RHS);
3642 }
3643 }
3644
3645 SDValue Cmp;
3646 AArch64CC::CondCode AArch64CC;
3647 if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
3648 const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
3649
3650 // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
3651 // For the i8 operand, the largest immediate is 255, so this can be easily
3652 // encoded in the compare instruction. For the i16 operand, however, the
3653 // largest immediate cannot be encoded in the compare.
3654 // Therefore, use a sign extending load and cmn to avoid materializing the
3655 // -1 constant. For example,
3656 // movz w1, #65535
3657 // ldrh w0, [x0, #0]
3658 // cmp w0, w1
3659 // >
3660 // ldrsh w0, [x0, #0]
3661 // cmn w0, #1
3662 // Fundamental, we're relying on the property that (zext LHS) == (zext RHS)
3663 // if and only if (sext LHS) == (sext RHS). The checks are in place to
3664 // ensure both the LHS and RHS are truly zero extended and to make sure the
3665 // transformation is profitable.
3666 if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
3667 cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
3668 cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
3669 LHS.getNode()->hasNUsesOfValue(1, 0)) {
3670 int16_t ValueofRHS = RHS->getAsZExtVal();
3671 if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
3672 SDValue SExt =
3673 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
3674 DAG.getValueType(MVT::i16));
3675 Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl,
3676 RHS.getValueType()),
3677 CC, dl, DAG);
3678 AArch64CC = changeIntCCToAArch64CC(CC);
3679 }
3680 }
3681
3682 if (!Cmp && (RHSC->isZero() || RHSC->isOne())) {
3683 if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
3684 if ((CC == ISD::SETNE) ^ RHSC->isZero())
3685 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
3686 }
3687 }
3688 }
3689
3690 if (!Cmp) {
3691 Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
3692 AArch64CC = changeIntCCToAArch64CC(CC);
3693 }
3694 AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
3695 return Cmp;
3696}
3697
3698static std::pair<SDValue, SDValue>
3700 assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
3701 "Unsupported value type");
3702 SDValue Value, Overflow;
3703 SDLoc DL(Op);
3704 SDValue LHS = Op.getOperand(0);
3705 SDValue RHS = Op.getOperand(1);
3706 unsigned Opc = 0;
3707 switch (Op.getOpcode()) {
3708 default:
3709 llvm_unreachable("Unknown overflow instruction!");
3710 case ISD::SADDO:
3711 Opc = AArch64ISD::ADDS;
3712 CC = AArch64CC::VS;
3713 break;
3714 case ISD::UADDO:
3715 Opc = AArch64ISD::ADDS;
3716 CC = AArch64CC::HS;
3717 break;
3718 case ISD::SSUBO:
3719 Opc = AArch64ISD::SUBS;
3720 CC = AArch64CC::VS;
3721 break;
3722 case ISD::USUBO:
3723 Opc = AArch64ISD::SUBS;
3724 CC = AArch64CC::LO;
3725 break;
3726 // Multiply needs a little bit extra work.
3727 case ISD::SMULO:
3728 case ISD::UMULO: {
3729 CC = AArch64CC::NE;
3730 bool IsSigned = Op.getOpcode() == ISD::SMULO;
3731 if (Op.getValueType() == MVT::i32) {
3732 // Extend to 64-bits, then perform a 64-bit multiply.
3733 unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3734 LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
3735 RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
3736 SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
3737 Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul);
3738
3739 // Check that the result fits into a 32-bit integer.
3740 SDVTList VTs = DAG.getVTList(MVT::i64, MVT_CC);
3741 if (IsSigned) {
3742 // cmp xreg, wreg, sxtw
3743 SDValue SExtMul = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Value);
3744 Overflow =
3745 DAG.getNode(AArch64ISD::SUBS, DL, VTs, Mul, SExtMul).getValue(1);
3746 } else {
3747 // tst xreg, #0xffffffff00000000
3748 SDValue UpperBits = DAG.getConstant(0xFFFFFFFF00000000, DL, MVT::i64);
3749 Overflow =
3750 DAG.getNode(AArch64ISD::ANDS, DL, VTs, Mul, UpperBits).getValue(1);
3751 }
3752 break;
3753 }
3754 assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
3755 // For the 64 bit multiply
3756 Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
3757 if (IsSigned) {
3758 SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
3759 SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
3760 DAG.getConstant(63, DL, MVT::i64));
3761 // It is important that LowerBits is last, otherwise the arithmetic
3762 // shift will not be folded into the compare (SUBS).
3763 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
3764 Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
3765 .getValue(1);
3766 } else {
3767 SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
3768 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
3769 Overflow =
3770 DAG.getNode(AArch64ISD::SUBS, DL, VTs,
3771 DAG.getConstant(0, DL, MVT::i64),
3772 UpperBits).getValue(1);
3773 }
3774 break;
3775 }
3776 } // switch (...)
3777
3778 if (Opc) {
3779 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
3780
3781 // Emit the AArch64 operation with overflow check.
3782 Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
3783 Overflow = Value.getValue(1);
3784 }
3785 return std::make_pair(Value, Overflow);
3786}
3787
3788SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
3789 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
3790 !Subtarget->isNeonAvailable()))
3791 return LowerToScalableOp(Op, DAG);
3792
3793 SDValue Sel = Op.getOperand(0);
3794 SDValue Other = Op.getOperand(1);
3795 SDLoc dl(Sel);
3796
3797 // If the operand is an overflow checking operation, invert the condition
3798 // code and kill the Not operation. I.e., transform:
3799 // (xor (overflow_op_bool, 1))
3800 // -->
3801 // (csel 1, 0, invert(cc), overflow_op_bool)
3802 // ... which later gets transformed to just a cset instruction with an
3803 // inverted condition code, rather than a cset + eor sequence.
3805 // Only lower legal XALUO ops.
3807 return SDValue();
3808
3809 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
3810 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
3812 SDValue Value, Overflow;
3813 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
3814 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
3815 return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal,
3816 CCVal, Overflow);
3817 }
3818 // If neither operand is a SELECT_CC, give up.
3819 if (Sel.getOpcode() != ISD::SELECT_CC)
3820 std::swap(Sel, Other);
3821 if (Sel.getOpcode() != ISD::SELECT_CC)
3822 return Op;
3823
3824 // The folding we want to perform is:
3825 // (xor x, (select_cc a, b, cc, 0, -1) )
3826 // -->
3827 // (csel x, (xor x, -1), cc ...)
3828 //
3829 // The latter will get matched to a CSINV instruction.
3830
3831 ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
3832 SDValue LHS = Sel.getOperand(0);
3833 SDValue RHS = Sel.getOperand(1);
3834 SDValue TVal = Sel.getOperand(2);
3835 SDValue FVal = Sel.getOperand(3);
3836
3837 // FIXME: This could be generalized to non-integer comparisons.
3838 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
3839 return Op;
3840
3841 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
3842 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
3843
3844 // The values aren't constants, this isn't the pattern we're looking for.
3845 if (!CFVal || !CTVal)
3846 return Op;
3847
3848 // We can commute the SELECT_CC by inverting the condition. This
3849 // might be needed to make this fit into a CSINV pattern.
3850 if (CTVal->isAllOnes() && CFVal->isZero()) {
3851 std::swap(TVal, FVal);
3852 std::swap(CTVal, CFVal);
3853 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
3854 }
3855
3856 // If the constants line up, perform the transform!
3857 if (CTVal->isZero() && CFVal->isAllOnes()) {
3858 SDValue CCVal;
3859 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
3860
3861 FVal = Other;
3862 TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
3863 DAG.getConstant(-1ULL, dl, Other.getValueType()));
3864
3865 return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
3866 CCVal, Cmp);
3867 }
3868
3869 return Op;
3870}
3871
3872// If Invert is false, sets 'C' bit of NZCV to 0 if value is 0, else sets 'C'
3873// bit to 1. If Invert is true, sets 'C' bit of NZCV to 1 if value is 0, else
3874// sets 'C' bit to 0.
3876 SDLoc DL(Value);
3877 EVT VT = Value.getValueType();
3878 SDValue Op0 = Invert ? DAG.getConstant(0, DL, VT) : Value;
3879 SDValue Op1 = Invert ? Value : DAG.getConstant(1, DL, VT);
3880 SDValue Cmp =
3881 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::Glue), Op0, Op1);
3882 return Cmp.getValue(1);
3883}
3884
3885// If Invert is false, value is 1 if 'C' bit of NZCV is 1, else 0.
3886// If Invert is true, value is 0 if 'C' bit of NZCV is 1, else 1.
3888 bool Invert) {
3889 assert(Glue.getResNo() == 1);
3890 SDLoc DL(Glue);
3891 SDValue Zero = DAG.getConstant(0, DL, VT);
3892 SDValue One = DAG.getConstant(1, DL, VT);
3893 unsigned Cond = Invert ? AArch64CC::LO : AArch64CC::HS;
3894 SDValue CC = DAG.getConstant(Cond, DL, MVT::i32);
3895 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
3896}
3897
3898// Value is 1 if 'V' bit of NZCV is 1, else 0
3900 assert(Glue.getResNo() == 1);
3901 SDLoc DL(Glue);
3902 SDValue Zero = DAG.getConstant(0, DL, VT);
3903 SDValue One = DAG.getConstant(1, DL, VT);
3904 SDValue CC = DAG.getConstant(AArch64CC::VS, DL, MVT::i32);
3905 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
3906}
3907
3908// This lowering is inefficient, but it will get cleaned up by
3909// `foldOverflowCheck`
3911 unsigned Opcode, bool IsSigned) {
3912 EVT VT0 = Op.getValue(0).getValueType();
3913 EVT VT1 = Op.getValue(1).getValueType();
3914
3915 if (VT0 != MVT::i32 && VT0 != MVT::i64)
3916 return SDValue();
3917
3918 bool InvertCarry = Opcode == AArch64ISD::SBCS;
3919 SDValue OpLHS = Op.getOperand(0);
3920 SDValue OpRHS = Op.getOperand(1);
3921 SDValue OpCarryIn = valueToCarryFlag(Op.getOperand(2), DAG, InvertCarry);
3922
3923 SDLoc DL(Op);
3924 SDVTList VTs = DAG.getVTList(VT0, VT1);
3925
3926 SDValue Sum = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, MVT::Glue), OpLHS,
3927 OpRHS, OpCarryIn);
3928
3929 SDValue OutFlag =
3930 IsSigned ? overflowFlagToValue(Sum.getValue(1), VT1, DAG)
3931 : carryFlagToValue(Sum.getValue(1), VT1, DAG, InvertCarry);
3932
3933 return DAG.getNode(ISD::MERGE_VALUES, DL, VTs, Sum, OutFlag);
3934}
3935
// Lower an overflow-producing arithmetic node by emitting the flag-setting
// AArch64 operation (via getAArch64XALUOOp) and converting the resulting
// condition code into a 0/1 value with an inverted CSEL, which selects to a
// single CSINC.
3937 // Let legalize expand this if it isn't a legal type yet.
3938 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
3939 return SDValue();
3940
3941 SDLoc dl(Op);
3943 // The actual operation that sets the overflow or carry flag.
3944 SDValue Value, Overflow;
3945 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
3946
3947 // We use 0 and 1 as false and true values.
3948 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
3949 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
3950
3951 // We use an inverted condition, because the conditional select is inverted
3952 // too. This will allow it to be selected to a single instruction:
3953 // CSINC Wd, WZR, WZR, invert(cond).
3954 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
3955 Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal,
3956 CCVal, Overflow);
3957
3958 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
3959 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
3960}
3961
3962// Prefetch operands are:
3963// 1: Address to prefetch
3964// 2: bool isWrite
3965// 3: int locality (0 = no locality ... 3 = extreme locality)
3966// 4: bool isDataCache
// Lower ISD::PREFETCH to AArch64ISD::PREFETCH, packing the hint operands
// into the 5-bit PRFM operation immediate:
//   bit 4 = load/store, bit 3 = I-cache, bits 2:1 = cache level, bit 0 = stream.
3968 SDLoc DL(Op);
3969 unsigned IsWrite = Op.getConstantOperandVal(2);
3970 unsigned Locality = Op.getConstantOperandVal(3);
3971 unsigned IsData = Op.getConstantOperandVal(4);
3972
// Locality 0 maps to the streaming (non-temporal) policy.
3973 bool IsStream = !Locality;
3974 // When the locality number is set
3975 if (Locality) {
3976 // The front-end should have filtered out the out-of-range values
3977 assert(Locality <= 3 && "Prefetch locality out-of-range");
3978 // The locality degree is the opposite of the cache speed.
3979 // Put the number the other way around.
3980 // The encoding starts at 0 for level 1
3981 Locality = 3 - Locality;
3982 }
3983
3984 // built the mask value encoding the expected behavior.
3985 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
3986 (!IsData << 3) | // IsDataCache bit
3987 (Locality << 1) | // Cache level bits
3988 (unsigned)IsStream; // Stream bit
3989 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
3990 DAG.getTargetConstant(PrfOp, DL, MVT::i32),
3991 Op.getOperand(1));
3992}
3993
3994SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
3995 SelectionDAG &DAG) const {
3996 EVT VT = Op.getValueType();
3997 if (VT.isScalableVector())
3998 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU);
3999
4000 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
4001 return LowerFixedLengthFPExtendToSVE(Op, DAG);
4002
4003 assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
4004 return SDValue();
4005}
4006
4007SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
4008 SelectionDAG &DAG) const {
// Lower (STRICT_)FP_ROUND: scalable vectors use the merging SVE node,
// SVE-sized fixed-length vectors take the SVE path, and the remainder is
// legal unless f128 is involved (expanded to a libcall via SDValue()).
4009 if (Op.getValueType().isScalableVector())
4010 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
4011
4012 bool IsStrict = Op->isStrictFPOpcode();
4013 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4014 EVT SrcVT = SrcVal.getValueType();
4015
4016 if (useSVEForFixedLengthVectorVT(SrcVT, !Subtarget->isNeonAvailable()))
4017 return LowerFixedLengthFPRoundToSVE(Op, DAG);
4018
4019 if (SrcVT != MVT::f128) {
4020 // Expand cases where the input is a vector bigger than NEON.
// NOTE(review): the guard condition preceding this return is not visible in
// this view; presumably it tests for an over-wide vector input — confirm
// against the full source.
4022 return SDValue();
4023
4024 // It's legal except when f128 is involved
4025 return Op;
4026 }
4027
4028 return SDValue();
4029}
4030
// Lower vector (STRICT_)FP_TO_[SU]INT. Handles scalable vectors via
// predicated SVE nodes, promotes f16 sources without full fp16, and
// legalizes mismatched source/result widths by truncating or extending
// around a same-width conversion.
4031SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
4032 SelectionDAG &DAG) const {
4033 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
4034 // Any additional optimization in this function should be recorded
4035 // in the cost tables.
4036 bool IsStrict = Op->isStrictFPOpcode();
4037 EVT InVT = Op.getOperand(IsStrict ? 1 : 0).getValueType();
4038 EVT VT = Op.getValueType();
4039
// Scalable vectors lower to a predicated SVE conversion node.
4040 if (VT.isScalableVector()) {
4041 unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT
4044 return LowerToPredicatedOp(Op, DAG, Opcode);
4045 }
4046
4047 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
4048 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
4049 return LowerFixedLengthFPToIntToSVE(Op, DAG);
4050
4051 unsigned NumElts = InVT.getVectorNumElements();
4052
4053 // f16 conversions are promoted to f32 when full fp16 is not supported.
4054 if (InVT.getVectorElementType() == MVT::f16 &&
4055 !Subtarget->hasFullFP16()) {
4056 MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
4057 SDLoc dl(Op);
4058 if (IsStrict) {
4059 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NewVT, MVT::Other},
4060 {Op.getOperand(0), Op.getOperand(1)});
4061 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
4062 {Ext.getValue(1), Ext.getValue(0)});
4063 }
4064 return DAG.getNode(
4065 Op.getOpcode(), dl, Op.getValueType(),
4066 DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
4067 }
4068
// Narrowing conversion: convert at the source width, then truncate.
4069 uint64_t VTSize = VT.getFixedSizeInBits();
4070 uint64_t InVTSize = InVT.getFixedSizeInBits();
4071 if (VTSize < InVTSize) {
4072 SDLoc dl(Op);
4073 if (IsStrict) {
4075 SDValue Cv = DAG.getNode(Op.getOpcode(), dl, {InVT, MVT::Other},
4076 {Op.getOperand(0), Op.getOperand(1)});
4077 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
4078 return DAG.getMergeValues({Trunc, Cv.getValue(1)}, dl);
4079 }
4080 SDValue Cv =
4081 DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
4082 Op.getOperand(0));
4083 return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
4084 }
4085
// Widening conversion: extend the FP source to the result width first.
4086 if (VTSize > InVTSize) {
4087 SDLoc dl(Op);
4088 MVT ExtVT =
4091 if (IsStrict) {
4092 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {ExtVT, MVT::Other},
4093 {Op.getOperand(0), Op.getOperand(1)});
4094 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
4095 {Ext.getValue(1), Ext.getValue(0)});
4096 }
4097 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
4098 return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
4099 }
4100
4101 // Use a scalar operation for conversions between single-element vectors of
4102 // the same size.
4103 if (NumElts == 1) {
4104 SDLoc dl(Op);
4105 SDValue Extract = DAG.getNode(
4107 Op.getOperand(IsStrict ? 1 : 0), DAG.getConstant(0, dl, MVT::i64));
4108 EVT ScalarVT = VT.getScalarType();
4109 if (IsStrict)
4110 return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other},
4111 {Op.getOperand(0), Extract});
4112 return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract);
4113 }
4114
4115 // Type changing conversions are illegal.
4116 return Op;
4117}
4118
4119SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
4120 SelectionDAG &DAG) const {
4121 bool IsStrict = Op->isStrictFPOpcode();
4122 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4123
4124 if (SrcVal.getValueType().isVector())
4125 return LowerVectorFP_TO_INT(Op, DAG);
4126
4127 // f16 conversions are promoted to f32 when full fp16 is not supported.
4128 if (SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
4129 SDLoc dl(Op);
4130 if (IsStrict) {
4131 SDValue Ext =
4132 DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
4133 {Op.getOperand(0), SrcVal});
4134 return DAG.getNode(Op.getOpcode(), dl, {Op.getValueType(), MVT::Other},
4135 {Ext.getValue(1), Ext.getValue(0)});
4136 }
4137 return DAG.getNode(
4138 Op.getOpcode(), dl, Op.getValueType(),
4139 DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, SrcVal));
4140 }
4141
4142 if (SrcVal.getValueType() != MVT::f128) {
4143 // It's legal except when f128 is involved
4144 return Op;
4145 }
4146
4147 return SDValue();
4148}
4149
4150SDValue
4151AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
4152 SelectionDAG &DAG) const {
4153 // AArch64 FP-to-int conversions saturate to the destination element size, so
4154 // we can lower common saturating conversions to simple instructions.
4155 SDValue SrcVal = Op.getOperand(0);
4156 EVT SrcVT = SrcVal.getValueType();
4157 EVT DstVT = Op.getValueType();
4158 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
4159
4160 uint64_t SrcElementWidth = SrcVT.getScalarSizeInBits();
4161 uint64_t DstElementWidth = DstVT.getScalarSizeInBits();
4162 uint64_t SatWidth = SatVT.getScalarSizeInBits();
4163 assert(SatWidth <= DstElementWidth &&
4164 "Saturation width cannot exceed result width");
4165
4166 // TODO: Consider lowering to SVE operations, as in LowerVectorFP_TO_INT.
4167 // Currently, the `llvm.fpto[su]i.sat.*` intrinsics don't accept scalable
4168 // types, so this is hard to reach.
4169 if (DstVT.isScalableVector())
4170 return SDValue();
4171
4172 EVT SrcElementVT = SrcVT.getVectorElementType();
4173
4174 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
4175 if (SrcElementVT == MVT::f16 &&
4176 (!Subtarget->hasFullFP16() || DstElementWidth > 16)) {
4177 MVT F32VT = MVT::getVectorVT(MVT::f32, SrcVT.getVectorNumElements());
4178 SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), F32VT, SrcVal);
4179 SrcVT = F32VT;
4180 SrcElementVT = MVT::f32;
4181 SrcElementWidth = 32;
4182 } else if (SrcElementVT != MVT::f64 && SrcElementVT != MVT::f32 &&
4183 SrcElementVT != MVT::f16)
4184 return SDValue();
4185
4186 SDLoc DL(Op);
4187 // Cases that we can emit directly.
4188 if (SrcElementWidth == DstElementWidth && SrcElementWidth == SatWidth)
4189 return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
4190 DAG.getValueType(DstVT.getScalarType()));
4191
4192 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
4193 // result. This is only valid if the legal cvt is larger than the saturate
4194 // width. For double, as we don't have MIN/MAX, it can be simpler to scalarize
4195 // (at least until sqxtn is selected).
4196 if (SrcElementWidth < SatWidth || SrcElementVT == MVT::f64)
4197 return SDValue();
4198
4199 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
4200 SDValue NativeCvt = DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal,
4201 DAG.getValueType(IntVT.getScalarType()));
4202 SDValue Sat;
4203 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
4204 SDValue MinC = DAG.getConstant(
4205 APInt::getSignedMaxValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
4206 SDValue Min = DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt, MinC);
4207 SDValue MaxC = DAG.getConstant(
4208 APInt::getSignedMinValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
4209 Sat = DAG.getNode(ISD::SMAX, DL, IntVT, Min, MaxC);
4210 } else {
4211 SDValue MinC = DAG.getConstant(
4212 APInt::getAllOnes(SatWidth).zext(SrcElementWidth), DL, IntVT);
4213 Sat = DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt, MinC);
4214 }
4215
4216 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
4217}
4218
4219SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
4220 SelectionDAG &DAG) const {
4221 // AArch64 FP-to-int conversions saturate to the destination register size, so
4222 // we can lower common saturating conversions to simple instructions.
4223 SDValue SrcVal = Op.getOperand(0);
4224 EVT SrcVT = SrcVal.getValueType();
4225
4226 if (SrcVT.isVector())
4227 return LowerVectorFP_TO_INT_SAT(Op, DAG);
4228
4229 EVT DstVT = Op.getValueType();
4230 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
4231 uint64_t SatWidth = SatVT.getScalarSizeInBits();
4232 uint64_t DstWidth = DstVT.getScalarSizeInBits();
4233 assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");
4234
4235 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
4236 if (SrcVT == MVT::f16 && !Subtarget->hasFullFP16()) {
4237 SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, SrcVal);
4238 SrcVT = MVT::f32;
4239 } else if (SrcVT != MVT::f64 && SrcVT != MVT::f32 && SrcVT != MVT::f16)
4240 return SDValue();
4241
4242 SDLoc DL(Op);
4243 // Cases that we can emit directly.
4244 if ((SrcVT == MVT::f64 || SrcVT == MVT::f32 ||
4245 (SrcVT == MVT::f16 && Subtarget->hasFullFP16())) &&
4246 DstVT == SatVT && (DstVT == MVT::i64 || DstVT == MVT::i32))
4247 return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
4248 DAG.getValueType(DstVT));
4249
4250 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
4251 // result. This is only valid if the legal cvt is larger than the saturate
4252 // width.
4253 if (DstWidth < SatWidth)
4254 return SDValue();
4255
4256 SDValue NativeCvt =
4257 DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal, DAG.getValueType(DstVT));
4258 SDValue Sat;
4259 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
4260 SDValue MinC = DAG.getConstant(
4261 APInt::getSignedMaxValue(SatWidth).sext(DstWidth), DL, DstVT);
4262 SDValue Min = DAG.getNode(ISD::SMIN, DL, DstVT, NativeCvt, MinC);
4263 SDValue MaxC = DAG.getConstant(
4264 APInt::getSignedMinValue(SatWidth).sext(DstWidth), DL, DstVT);
4265 Sat = DAG.getNode(ISD::SMAX, DL, DstVT, Min, MaxC);
4266 } else {
4267 SDValue MinC = DAG.getConstant(
4268 APInt::getAllOnes(SatWidth).zext(DstWidth), DL, DstVT);
4269 Sat = DAG.getNode(ISD::UMIN, DL, DstVT, NativeCvt, MinC);
4270 }
4271
4272 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
4273}
4274
// Lower vector (STRICT_)[SU]INT_TO_FP. Handles scalable vectors (including
// i1 predicate sources, which are extended first), SVE-sized fixed-length
// vectors, and width mismatches by rounding or extending around a same-width
// conversion.
4275SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
4276 SelectionDAG &DAG) const {
4277 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
4278 // Any additional optimization in this function should be recorded
4279 // in the cost tables.
4280 bool IsStrict = Op->isStrictFPOpcode();
4281 EVT VT = Op.getValueType();
4282 SDLoc dl(Op);
4283 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
4284 EVT InVT = In.getValueType();
4285 unsigned Opc = Op.getOpcode();
4286 bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
4287
4288 if (VT.isScalableVector()) {
4289 if (InVT.getVectorElementType() == MVT::i1) {
4290 // We can't directly extend an SVE predicate; extend it first.
4291 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4292 EVT CastVT = getPromotedVTForPredicate(InVT);
4293 In = DAG.getNode(CastOpc, dl, CastVT, In);
4294 return DAG.getNode(Opc, dl, VT, In);
4295 }
4296
4297 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
4299 return LowerToPredicatedOp(Op, DAG, Opcode);
4300 }
4301
4302 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
4303 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
4304 return LowerFixedLengthIntToFPToSVE(Op, DAG);
4305
// Narrowing: convert at the (wider) integer width, then FP_ROUND down.
4306 uint64_t VTSize = VT.getFixedSizeInBits();
4307 uint64_t InVTSize = InVT.getFixedSizeInBits();
4308 if (VTSize < InVTSize) {
4309 MVT CastVT =
4311 InVT.getVectorNumElements());
4312 if (IsStrict) {
4313 In = DAG.getNode(Opc, dl, {CastVT, MVT::Other},
4314 {Op.getOperand(0), In});
4315 return DAG.getNode(
4316 ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
4317 {In.getValue(1), In.getValue(0), DAG.getIntPtrConstant(0, dl)});
4318 }
4319 In = DAG.getNode(Opc, dl, CastVT, In);
4320 return DAG.getNode(ISD::FP_ROUND, dl, VT, In,
4321 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
4322 }
4323
// Widening: sign/zero-extend the integer source to the result width first.
4324 if (VTSize > InVTSize) {
4325 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4327 In = DAG.getNode(CastOpc, dl, CastVT, In);
4328 if (IsStrict)
4329 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op.getOperand(0), In});
4330 return DAG.getNode(Opc, dl, VT, In);
4331 }
4332
4333 // Use a scalar operation for conversions between single-element vectors of
4334 // the same size.
4335 if (VT.getVectorNumElements() == 1) {
4336 SDValue Extract = DAG.getNode(
4338 In, DAG.getConstant(0, dl, MVT::i64));
4339 EVT ScalarVT = VT.getScalarType();
4340 if (IsStrict)
4341 return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other},
4342 {Op.getOperand(0), Extract});
4343 return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract);
4344 }
4345
4346 return Op;
4347}
4348
4349SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
4350 SelectionDAG &DAG) const {
4351 if (Op.getValueType().isVector())
4352 return LowerVectorINT_TO_FP(Op, DAG);
4353
4354 bool IsStrict = Op->isStrictFPOpcode();
4355 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4356
4357 // f16 conversions are promoted to f32 when full fp16 is not supported.
4358 if (Op.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
4359 SDLoc dl(Op);
4360 if (IsStrict) {
4361 SDValue Val = DAG.getNode(Op.getOpcode(), dl, {MVT::f32, MVT::Other},
4362 {Op.getOperand(0), SrcVal});
4363 return DAG.getNode(
4364 ISD::STRICT_FP_ROUND, dl, {MVT::f16, MVT::Other},
4365 {Val.getValue(1), Val.getValue(0), DAG.getIntPtrConstant(0, dl)});
4366 }
4367 return DAG.getNode(
4368 ISD::FP_ROUND, dl, MVT::f16,
4369 DAG.getNode(Op.getOpcode(), dl, MVT::f32, SrcVal),
4370 DAG.getIntPtrConstant(0, dl));
4371 }
4372
4373 // i128 conversions are libcalls.
4374 if (SrcVal.getValueType() == MVT::i128)
4375 return SDValue();
4376
4377 // Other conversions are legal, unless it's to the completely software-based
4378 // fp128.
4379 if (Op.getValueType() != MVT::f128)
4380 return Op;
4381 return SDValue();
4382}
4383
// Lower ISD::FSINCOS to a call to the __sincos_stret libcall, which
// computes sin and cos in one call and returns both in registers.
4384SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
4385 SelectionDAG &DAG) const {
4386 // For iOS, we want to call an alternative entry point: __sincos_stret,
4387 // which returns the values in two S / D registers.
4388 SDLoc dl(Op);
4389 SDValue Arg = Op.getOperand(0);
4390 EVT ArgVT = Arg.getValueType();
4391 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
4392
// Build the single floating-point argument for the libcall.
4394 ArgListEntry Entry;
4395
4396 Entry.Node = Arg;
4397 Entry.Ty = ArgTy;
4398 Entry.IsSExt = false;
4399 Entry.IsZExt = false;
4400 Args.push_back(Entry);
4401
// Select the float or double flavour of the libcall from the argument type.
4402 RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
4403 : RTLIB::SINCOS_STRET_F32;
4404 const char *LibcallName = getLibcallName(LC);
4405 SDValue Callee =
4406 DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
4407
// The libcall returns {sin, cos} as a two-element struct of the arg type.
4408 StructType *RetTy = StructType::get(ArgTy, ArgTy);
4410 CLI.setDebugLoc(dl)
4411 .setChain(DAG.getEntryNode())
4412 .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));
4413
4414 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
4415 return CallResult.first;
4416}
4417
4418static MVT getSVEContainerType(EVT ContentTy);
4419
// Custom BITCAST lowering: fixed-length SVE-sized vectors take the SVE path;
// scalable vectors go through getSVESafeBitCast (with an extend when casting
// an illegal int type to a legal FP type); scalar i16 -> f16/bf16 casts are
// done via an i32 any-extend, a bitcast to f32, and an hsub subreg extract.
4420SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op,
4421 SelectionDAG &DAG) const {
4422 EVT OpVT = Op.getValueType();
4423 EVT ArgVT = Op.getOperand(0).getValueType();
4424
4426 return LowerFixedLengthBitcastToSVE(Op, DAG);
4427
4428 if (OpVT.isScalableVector()) {
4429 // Bitcasting between unpacked vector types of different element counts is
4430 // not a NOP because the live elements are laid out differently.
4431 // 01234567
4432 // e.g. nxv2i32 = XX??XX??
4433 // nxv4f16 = X?X?X?X?
4434 if (OpVT.getVectorElementCount() != ArgVT.getVectorElementCount())
4435 return SDValue();
4436
4437 if (isTypeLegal(OpVT) && !isTypeLegal(ArgVT)) {
4438 assert(OpVT.isFloatingPoint() && !ArgVT.isFloatingPoint() &&
4439 "Expected int->fp bitcast!");
4440 SDValue ExtResult =
4442 Op.getOperand(0));
4443 return getSVESafeBitCast(OpVT, ExtResult, DAG);
4444 }
4445 return getSVESafeBitCast(OpVT, Op.getOperand(0), DAG);
4446 }
4447
4448 if (OpVT != MVT::f16 && OpVT != MVT::bf16)
4449 return SDValue();
4450
4451 // Bitcasts between f16 and bf16 are legal.
4452 if (ArgVT == MVT::f16 || ArgVT == MVT::bf16)
4453 return Op;
4454
4455 assert(ArgVT == MVT::i16);
4456 SDLoc DL(Op);
4457
// Widen to i32, view as f32, then take the low 16 bits as an fp register.
4458 Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
4459 Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
4460 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, OpVT, Op);
4461}
4462
4463static EVT getExtensionTo64Bits(const EVT &OrigVT) {
4464 if (OrigVT.getSizeInBits() >= 64)
4465 return OrigVT;
4466
4467 assert(OrigVT.isSimple() && "Expecting a simple value type");
4468
4469 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
4470 switch (OrigSimpleTy) {
4471 default: llvm_unreachable("Unexpected Vector Type");
4472 case MVT::v2i8:
4473 case MVT::v2i16:
4474 return MVT::v2i32;
4475 case MVT::v4i8:
4476 return MVT::v4i16;
4477 }
4478}
4479
// Given a value N that was extended from OrigTy up to the 128-bit ExtTy,
// insert an additional extension so that sub-64-bit sources become 64 bits,
// as required for the operands of a [SU]MULL.
4481 const EVT &OrigTy,
4482 const EVT &ExtTy,
4483 unsigned ExtOpcode) {
4484 // The vector originally had a size of OrigTy. It was then extended to ExtTy.
4485 // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
4486 // 64-bits we need to insert a new extension so that it will be 64-bits.
4487 assert(ExtTy.is128BitVector() && "Unexpected extension size");
4488 if (OrigTy.getSizeInBits() >= 64)
4489 return N;
4490
4491 // Must extend size to at least 64 bits to be used as an operand for VMULL.
4492 EVT NewVT = getExtensionTo64Bits(OrigTy);
4493
4494 return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
4495}
4496
4497// Returns lane if Op extracts from a two-element vector and lane is constant
4498// (i.e., extractelt(<2 x Ty> %v, ConstantLane)), and std::nullopt otherwise.
4499static std::optional<uint64_t>
4501 SDNode *OpNode = Op.getNode();
4502 if (OpNode->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
4503 return std::nullopt;
4504
// Only fixed two-element vectors with a constant lane index qualify.
4505 EVT VT = OpNode->getOperand(0).getValueType();
4506 ConstantSDNode *C = dyn_cast<ConstantSDNode>(OpNode->getOperand(1));
4507 if (!VT.isFixedLengthVector() || VT.getVectorNumElements() != 2 || !C)
4508 return std::nullopt;
4509
4510 return C->getZExtValue();
4511}
4512
// Returns true if N is a BUILD_VECTOR of constants where every element fits
// in half the element width (signed or unsigned per isSigned), i.e. the
// vector is effectively an extension of a vector with half-width elements.
4514 bool isSigned) {
4515 EVT VT = N.getValueType();
4516
4517 if (N.getOpcode() != ISD::BUILD_VECTOR)
4518 return false;
4519
4520 for (const SDValue &Elt : N->op_values()) {
4521 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
4522 unsigned EltSize = VT.getScalarSizeInBits();
4523 unsigned HalfSize = EltSize / 2;
4524 if (isSigned) {
4525 if (!isIntN(HalfSize, C->getSExtValue()))
4526 return false;
4527 } else {
4528 if (!isUIntN(HalfSize, C->getZExtValue()))
4529 return false;
4530 }
4531 continue;
4532 }
// Any non-constant element disqualifies the vector.
4533 return false;
4534 }
4535
4536 return true;
4537}
4538
// Strip the extension from a [SU]MULL operand, returning the half-width
// value: either a truncate (when the high bits are known zero), the operand
// of an explicit extend, or a rebuilt half-width constant BUILD_VECTOR.
4540 EVT VT = N.getValueType();
4541 assert(VT.is128BitVector() && "Unexpected vector MULL size");
4542
4543 unsigned NumElts = VT.getVectorNumElements();
4544 unsigned OrigEltSize = VT.getScalarSizeInBits();
4545 unsigned EltSize = OrigEltSize / 2;
4546 MVT TruncVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
4547
// If the upper half of every element is known zero, a plain truncate is
// lossless.
4548 APInt HiBits = APInt::getHighBitsSet(OrigEltSize, EltSize);
4549 if (DAG.MaskedValueIsZero(N, HiBits))
4550 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), TruncVT, N);
4551
4552 if (ISD::isExtOpcode(N.getOpcode()))
4553 return addRequiredExtensionForVectorMULL(N.getOperand(0), DAG,
4554 N.getOperand(0).getValueType(), VT,
4555 N.getOpcode());
4556
4557 assert(N.getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
4558 SDLoc dl(N);
4560 for (unsigned i = 0; i != NumElts; ++i) {
4561 const APInt &CInt = N.getConstantOperandAPInt(i);
4562 // Element types smaller than 32 bits are not legal, so use i32 elements.
4563 // The values are implicitly truncated so sext vs. zext doesn't matter.
4564 Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
4565 }
4566 return DAG.getBuildVector(TruncVT, dl, Ops);
4567}
4568
// Returns true if N can serve as a sign-extended [SU]MULL operand: an
// explicit SIGN_EXTEND/ANY_EXTEND, or a constant BUILD_VECTOR whose elements
// all fit in the signed half-width.
4570 return N.getOpcode() == ISD::SIGN_EXTEND ||
4571 N.getOpcode() == ISD::ANY_EXTEND ||
4572 isExtendedBUILD_VECTOR(N, DAG, true);
4573}
4574
// Returns true if N can serve as a zero-extended [SU]MULL operand: an
// explicit ZERO_EXTEND/ANY_EXTEND, or a constant BUILD_VECTOR whose elements
// all fit in the unsigned half-width.
4576 return N.getOpcode() == ISD::ZERO_EXTEND ||
4577 N.getOpcode() == ISD::ANY_EXTEND ||
4578 isExtendedBUILD_VECTOR(N, DAG, false);
4579}
4580
// Returns true if N is an ADD/SUB whose two operands are single-use and both
// sign-extended -- a shape LowerMUL can split into two long multiplies.
4582 unsigned Opcode = N.getOpcode();
4583 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
4584 SDValue N0 = N.getOperand(0);
4585 SDValue N1 = N.getOperand(1);
4586 return N0->hasOneUse() && N1->hasOneUse() &&
4587 isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
4588 }
4589 return false;
4590}
4591
// Returns true if N is an ADD/SUB whose two operands are single-use and both
// zero-extended -- a shape LowerMUL can split into two long multiplies.
4593 unsigned Opcode = N.getOpcode();
4594 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
4595 SDValue N0 = N.getOperand(0);
4596 SDValue N1 = N.getOperand(1);
4597 return N0->hasOneUse() && N1->hasOneUse() &&
4598 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
4599 }
4600 return false;
4601}
4602
4603SDValue AArch64TargetLowering::LowerGET_ROUNDING(SDValue Op,
4604 SelectionDAG &DAG) const {
4605 // The rounding mode is in bits 23:22 of the FPSCR.
4606 // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
4607 // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3)
4608 // so that the shift + and get folded into a bitfield extract.
4609 SDLoc dl(Op);
4610
4611 SDValue Chain = Op.getOperand(0);
4612 SDValue FPCR_64 = DAG.getNode(
4613 ISD::INTRINSIC_W_CHAIN, dl, {MVT::i64, MVT::Other},
4614 {Chain, DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl, MVT::i64)});
4615 Chain = FPCR_64.getValue(1);
4616 SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, FPCR_64);
4617 SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPCR_32,
4618 DAG.getConstant(1U << 22, dl, MVT::i32));
4619 SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
4620 DAG.getConstant(22, dl, MVT::i32));
4621 SDValue AND = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
4622 DAG.getConstant(3, dl, MVT::i32));
4623 return DAG.getMergeValues({AND, Chain}, dl);
4624}
4625
4626SDValue AArch64TargetLowering::LowerSET_ROUNDING(SDValue Op,
4627 SelectionDAG &DAG) const {
4628 SDLoc DL(Op);
4629 SDValue Chain = Op->getOperand(0);
4630 SDValue RMValue = Op->getOperand(1);
4631
4632 // The rounding mode is in bits 23:22 of the FPCR.
4633 // The llvm.set.rounding argument value to the rounding mode in FPCR mapping
4634 // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
4635 // ((arg - 1) & 3) << 22).
4636 //
4637 // The argument of llvm.set.rounding must be within the segment [0, 3], so
4638 // NearestTiesToAway (4) is not handled here. It is responsibility of the code
4639 // generated llvm.set.rounding to ensure this condition.
4640
4641 // Calculate new value of FPCR[23:22].
4642 RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
4643 DAG.getConstant(1, DL, MVT::i32));
4644 RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
4645 DAG.getConstant(0x3, DL, MVT::i32));
4646 RMValue =
4647 DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
4648 DAG.getConstant(AArch64::RoundingBitsPos, DL, MVT::i32));
4649 RMValue = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, RMValue);
4650
4651 // Get current value of FPCR.
4652 SDValue Ops[] = {
4653 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
4654 SDValue FPCR =
4655 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
4656 Chain = FPCR.getValue(1);
4657 FPCR = FPCR.getValue(0);
4658
4659 // Put new rounding mode into FPSCR[23:22].
4660 const int RMMask = ~(AArch64::Rounding::rmMask << AArch64::RoundingBitsPos);
4661 FPCR = DAG.getNode(ISD::AND, DL, MVT::i64, FPCR,
4662 DAG.getConstant(RMMask, DL, MVT::i64));
4663 FPCR = DAG.getNode(ISD::OR, DL, MVT::i64, FPCR, RMValue);
4664 SDValue Ops2[] = {
4665 Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
4666 FPCR};
4667 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
4668}
4669
// Pick SMULL or UMULL (or 0 when neither fits) for a long multiply of
// N0 * N1. May rewrite N0/N1 in place (e.g. replacing a zext with a sext
// when the sign bit is known zero), and sets IsMLA when the multiply should
// be decomposed into two long multiplies feeding an add/sub.
4670static unsigned selectUmullSmull(SDValue &N0, SDValue &N1, SelectionDAG &DAG,
4671 SDLoc DL, bool &IsMLA) {
4672 bool IsN0SExt = isSignExtended(N0, DAG);
4673 bool IsN1SExt = isSignExtended(N1, DAG);
4674 if (IsN0SExt && IsN1SExt)
4675 return AArch64ISD::SMULL;
4676
4677 bool IsN0ZExt = isZeroExtended(N0, DAG);
4678 bool IsN1ZExt = isZeroExtended(N1, DAG);
4679
4680 if (IsN0ZExt && IsN1ZExt)
4681 return AArch64ISD::UMULL;
4682
4683 // Select SMULL if we can replace zext with sext.
4684 if (((IsN0SExt && IsN1ZExt) || (IsN0ZExt && IsN1SExt)) &&
4685 !isExtendedBUILD_VECTOR(N0, DAG, false) &&
4686 !isExtendedBUILD_VECTOR(N1, DAG, false)) {
4687 SDValue ZextOperand;
4688 if (IsN0ZExt)
4689 ZextOperand = N0.getOperand(0);
4690 else
4691 ZextOperand = N1.getOperand(0);
// A zext whose operand has a clear sign bit is equivalent to a sext.
4692 if (DAG.SignBitIsZero(ZextOperand)) {
4693 SDValue NewSext =
4694 DAG.getSExtOrTrunc(ZextOperand, DL, N0.getValueType());
4695 if (IsN0ZExt)
4696 N0 = NewSext;
4697 else
4698 N1 = NewSext;
4699 return AArch64ISD::SMULL;
4700 }
4701 }
4702
4703 // Select UMULL if we can replace the other operand with an extend.
4704 if (IsN0ZExt || IsN1ZExt) {
4705 EVT VT = N0.getValueType();
4707 VT.getScalarSizeInBits() / 2);
4708 if (DAG.MaskedValueIsZero(IsN0ZExt ? N1 : N0, Mask))
4709 return AArch64ISD::UMULL;
4710 }
4711
4712 if (!IsN1SExt && !IsN1ZExt)
4713 return 0;
4714
4715 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
4716 // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
4717 if (IsN1SExt && isAddSubSExt(N0, DAG)) {
4718 IsMLA = true;
4719 return AArch64ISD::SMULL;
4720 }
4721 if (IsN1ZExt && isAddSubZExt(N0, DAG)) {
4722 IsMLA = true;
4723 return AArch64ISD::UMULL;
4724 }
// Symmetric case: the add/sub may be on the N1 side; swap so the MLA
// expansion below always finds it in N0.
4725 if (IsN0ZExt && isAddSubZExt(N1, DAG)) {
4726 std::swap(N0, N1);
4727 IsMLA = true;
4728 return AArch64ISD::UMULL;
4729 }
4730 return 0;
4731}
4732
// Custom-lower ISD::MUL: scalable/SVE-sized vectors use the predicated MUL,
// and 64-bit NEON vectors are pattern-matched into SMULL/UMULL (possibly as
// two long multiplies plus add/sub when IsMLA is set by selectUmullSmull).
4733SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
4734 EVT VT = Op.getValueType();
4735
4736 bool OverrideNEON = !Subtarget->isNeonAvailable();
4737 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
4738 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
4739
4740 // Multiplications are only custom-lowered for 128-bit and 64-bit vectors so
4741 // that VMULL can be detected. Otherwise v2i64 multiplications are not legal.
4742 assert((VT.is128BitVector() || VT.is64BitVector()) && VT.isInteger() &&
4743 "unexpected type for custom-lowering ISD::MUL");
4744 SDValue N0 = Op.getOperand(0);
4745 SDValue N1 = Op.getOperand(1);
4746 bool isMLA = false;
4747 EVT OVT = VT;
// For 64-bit vectors, look through extract-low-half pairs so the multiply
// can be done at 128 bits with [SU]MULL; v1i64 needs SVE or expansion.
4748 if (VT.is64BitVector()) {
4749 if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
4750 isNullConstant(N0.getOperand(1)) &&
4752 isNullConstant(N1.getOperand(1))) {
4753 N0 = N0.getOperand(0);
4754 N1 = N1.getOperand(0);
4755 VT = N0.getValueType();
4756 } else {
4757 if (VT == MVT::v1i64) {
4758 if (Subtarget->hasSVE())
4759 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
4760 // Fall through to expand this. It is not legal.
4761 return SDValue();
4762 } else
4763 // Other vector multiplications are legal.
4764 return Op;
4765 }
4766 }
4767
4768 SDLoc DL(Op);
4769 unsigned NewOpc = selectUmullSmull(N0, N1, DAG, DL, isMLA);
4770
4771 if (!NewOpc) {
4772 if (VT.getVectorElementType() == MVT::i64) {
4773 // If SVE is available then i64 vector multiplications can also be made
4774 // legal.
4775 if (Subtarget->hasSVE())
4776 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
4777 // Fall through to expand this. It is not legal.
4778 return SDValue();
4779 } else
4780 // Other vector multiplications are legal.
4781 return Op;
4782 }
4783
4784 // Legalize to a S/UMULL instruction
4785 SDValue Op0;
4786 SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
4787 if (!isMLA) {
4788 Op0 = skipExtensionForVectorMULL(N0, DAG);
4790 Op1.getValueType().is64BitVector() &&
4791 "unexpected types for extended operands to VMULL");
4792 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OVT,
4793 DAG.getNode(NewOpc, DL, VT, Op0, Op1),
4794 DAG.getConstant(0, DL, MVT::i64));
4795 }
4796 // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during
4797 // isel lowering to take advantage of no-stall back to back s/umul + s/umla.
4798 // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57
4801 EVT Op1VT = Op1.getValueType();
4802 return DAG.getNode(
4804 DAG.getNode(N0.getOpcode(), DL, VT,
4805 DAG.getNode(NewOpc, DL, VT,
4806 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
4807 DAG.getNode(NewOpc, DL, VT,
4808 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)),
4809 DAG.getConstant(0, DL, MVT::i64));
4810}
4811
4812static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
4813 int Pattern) {
4814 if (VT == MVT::nxv1i1 && Pattern == AArch64SVEPredPattern::all)
4815 return DAG.getConstant(1, DL, MVT::nxv1i1);
4816 return DAG.getNode(AArch64ISD::PTRUE, DL, VT,
4817 DAG.getTargetConstant(Pattern, DL, MVT::i32));
4818}
4819
// Fold an SVE WHILE intrinsic with constant bounds into a PTRUE of an
// equivalent fixed pattern, when the active-element count does not overflow
// and fits within the minimum SVE vector length.
4820static SDValue optimizeWhile(SDValue Op, SelectionDAG &DAG, bool IsSigned,
4821 bool IsLess, bool IsEqual) {
4822 if (!isa<ConstantSDNode>(Op.getOperand(1)) ||
4823 !isa<ConstantSDNode>(Op.getOperand(2)))
4824 return SDValue();
4825
4826 SDLoc dl(Op);
4827 APInt X = Op.getConstantOperandAPInt(1);
4828 APInt Y = Op.getConstantOperandAPInt(2);
// Active elements = the (signed/unsigned) difference of the two bounds,
// bailing out on overflow.
4829 APInt NumActiveElems;
4830 bool Overflow;
4831 if (IsLess)
4832 NumActiveElems = IsSigned ? Y.ssub_ov(X, Overflow) : Y.usub_ov(X, Overflow);
4833 else
4834 NumActiveElems = IsSigned ? X.ssub_ov(Y, Overflow) : X.usub_ov(Y, Overflow);
4835
4836 if (Overflow)
4837 return SDValue();
4838
// Inclusive comparisons (LE/LS/GE/HS) cover one extra element.
4839 if (IsEqual) {
4840 APInt One(NumActiveElems.getBitWidth(), 1, IsSigned);
4841 NumActiveElems = IsSigned ? NumActiveElems.sadd_ov(One, Overflow)
4842 : NumActiveElems.uadd_ov(One, Overflow);
4843 if (Overflow)
4844 return SDValue();
4845 }
4846
// Emit a PTRUE only if a matching pattern exists and the count is
// guaranteed to fit in the minimum configured SVE vector length.
4847 std::optional<unsigned> PredPattern =
4849 unsigned MinSVEVectorSize = std::max(
4851 unsigned ElementSize = 128 / Op.getValueType().getVectorMinNumElements();
4852 if (PredPattern != std::nullopt &&
4853 NumActiveElems.getZExtValue() <= (MinSVEVectorSize / ElementSize))
4854 return getPTrue(DAG, dl, Op.getValueType(), *PredPattern);
4855
4856 return SDValue();
4857}
4858
// Returns a safe bitcast between two scalable vector predicates, where
// any newly created lanes from a widening bitcast are defined as zero.
  SDLoc DL(Op);
  EVT InVT = Op.getValueType();

  assert(InVT.getVectorElementType() == MVT::i1 &&
         VT.getVectorElementType() == MVT::i1 &&
         "Expected a predicate-to-predicate bitcast");
         InVT.isScalableVector() &&
         DAG.getTargetLoweringInfo().isTypeLegal(InVT) &&
         "Only expect to cast between legal scalable predicate types!");

  // Return the operand if the cast isn't changing type,
  // e.g. <n x 16 x i1> -> <n x 16 x i1>
  if (InVT == VT)
    return Op;

  SDValue Reinterpret = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);

  // We only have to zero the lanes if new lanes are being defined, e.g. when
  // casting from <vscale x 2 x i1> to <vscale x 16 x i1>. If this is not the
  // case (e.g. when casting from <vscale x 16 x i1> -> <vscale x 2 x i1>) then
  // we can return here.
  if (InVT.bitsGT(VT))
    return Reinterpret;

  // Check if the other lanes are already known to be zeroed by
  // construction.
    return Reinterpret;

  // Zero the newly introduced lanes.
  // NOTE(review): presumably an all-active InVT mask reinterpreted to VT has
  // ones only in the original lanes, so the AND clears the widened lanes —
  // confirm against REINTERPRET_CAST semantics.
  SDValue Mask = DAG.getConstant(1, DL, InVT);
  Mask = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Mask);
  return DAG.getNode(ISD::AND, DL, VT, Reinterpret, Mask);
}
4897
// Emit a runtime call to the SME support routine __arm_sme_state and return
// an i64 node holding only the PSTATE.SM bit (bit 0 of the first returned
// i64 of the {i64, i64} result).
SDValue AArch64TargetLowering::getRuntimePStateSM(SelectionDAG &DAG,
                                                  SDValue Chain, SDLoc DL,
                                                  EVT VT) const {
  SDValue Callee = DAG.getExternalSymbol("__arm_sme_state",
  Type *Int64Ty = Type::getInt64Ty(*DAG.getContext());
  // __arm_sme_state returns a pair of i64 values.
  Type *RetTy = StructType::get(Int64Ty, Int64Ty);
  CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
      RetTy, Callee, std::move(Args));
  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
  // Mask off everything except the PSTATE.SM bit.
  SDValue Mask = DAG.getConstant(/*PSTATE.SM*/ 1, DL, MVT::i64);
  return DAG.getNode(ISD::AND, DL, MVT::i64, CallResult.first.getOperand(0),
                     Mask);
}
4915
// Lower an SME LDR/STR ZA intrinsic
// Case 1: If the vector number (vecnum) is an immediate in range, it gets
// folded into the instruction
//   ldr(%tileslice, %ptr, 11) -> ldr [%tileslice, 11], [%ptr, 11]
// Case 2: If the vecnum is not an immediate, then it is used to modify the base
// and tile slice registers
//   ldr(%tileslice, %ptr, %vecnum)
//   ->
//       %svl = rdsvl
//       %ptr2 = %ptr + %svl * %vecnum
//       %tileslice2 = %tileslice + %vecnum
//       ldr [%tileslice2, 0], [%ptr2, 0]
// Case 3: If the vecnum is an immediate out of range, then the same is done as
// case 2, but the base and slice registers are modified by the greatest
// multiple of 15 lower than the vecnum and the remainder is folded into the
// instruction. This means that successive loads and stores that are offset from
// each other can share the same base and slice register updates.
//   ldr(%tileslice, %ptr, 22)
//   ldr(%tileslice, %ptr, 23)
//   ->
//       %svl = rdsvl
//       %ptr2 = %ptr + %svl * 15
//       %tileslice2 = %tileslice + 15
//       ldr [%tileslice2, 7], [%ptr2, 7]
//       ldr [%tileslice2, 8], [%ptr2, 8]
// Case 4: If the vecnum is an add of an immediate, then the non-immediate
// operand and the immediate can be folded into the instruction, like case 2.
//   ldr(%tileslice, %ptr, %vecnum + 7)
//   ldr(%tileslice, %ptr, %vecnum + 8)
//   ->
//       %svl = rdsvl
//       %ptr2 = %ptr + %svl * %vecnum
//       %tileslice2 = %tileslice + %vecnum
//       ldr [%tileslice2, 7], [%ptr2, 7]
//       ldr [%tileslice2, 8], [%ptr2, 8]
// Case 5: The vecnum being an add of an immediate out of range is also handled,
// in which case the same remainder logic as case 3 is used.
// NOTE(review): the examples above split on multiples of 15, but the code
// below rounds ConstAddend to a multiple of 16 (% 16) — confirm which the
// comment should say.
  SDLoc DL(N);

  // Intrinsic operands: (chain, tileslice, base pointer, vector number).
  SDValue TileSlice = N->getOperand(2);
  SDValue Base = N->getOperand(3);
  SDValue VecNum = N->getOperand(4);
  int32_t ConstAddend = 0;
  SDValue VarAddend = VecNum;

  // If the vnum is an add of an immediate, we can fold it into the instruction
  if (VecNum.getOpcode() == ISD::ADD &&
      isa<ConstantSDNode>(VecNum.getOperand(1))) {
    ConstAddend = cast<ConstantSDNode>(VecNum.getOperand(1))->getSExtValue();
    VarAddend = VecNum.getOperand(0);
  } else if (auto ImmNode = dyn_cast<ConstantSDNode>(VecNum)) {
    // Purely constant vecnum: no variable part remains.
    ConstAddend = ImmNode->getSExtValue();
    VarAddend = SDValue();
  }

  // Split the constant into the part folded into the instruction immediate
  // (remainder mod 16) and the part applied to the base/slice registers.
  int32_t ImmAddend = ConstAddend % 16;
  if (int32_t C = (ConstAddend - ImmAddend)) {
    SDValue CVal = DAG.getTargetConstant(C, DL, MVT::i32);
    VarAddend = VarAddend
                    ? DAG.getNode(ISD::ADD, DL, MVT::i32, {VarAddend, CVal})
                    : CVal;
  }

  if (VarAddend) {
    // Get the vector length that will be multiplied by vnum
    auto SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
                           DAG.getConstant(1, DL, MVT::i32));

    // Multiply SVL and vnum then add it to the base
    SDValue Mul = DAG.getNode(
        ISD::MUL, DL, MVT::i64,
        {SVL, DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, VarAddend)});
    Base = DAG.getNode(ISD::ADD, DL, MVT::i64, {Base, Mul});
    // Just add vnum to the tileslice
    TileSlice = DAG.getNode(ISD::ADD, DL, MVT::i32, {TileSlice, VarAddend});
  }

      DL, MVT::Other,
      {/*Chain=*/N.getOperand(0), TileSlice, Base,
       DAG.getTargetConstant(ImmAddend, DL, MVT::i32)});
}
4999
5000SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op,
5001 SelectionDAG &DAG) const {
5002 unsigned IntNo = Op.getConstantOperandVal(1);
5003 SDLoc DL(Op);
5004 switch (IntNo) {
5005 default:
5006 return SDValue(); // Don't custom lower most intrinsics.
5007 case Intrinsic::aarch64_prefetch: {
5008 SDValue Chain = Op.getOperand(0);
5009 SDValue Addr = Op.getOperand(2);
5010
5011 unsigned IsWrite = Op.getConstantOperandVal(3);
5012 unsigned Locality = Op.getConstantOperandVal(4);
5013 unsigned IsStream = Op.getConstantOperandVal(5);
5014 unsigned IsData = Op.getConstantOperandVal(6);
5015 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
5016 (!IsData << 3) | // IsDataCache bit
5017 (Locality << 1) | // Cache level bits
5018 (unsigned)IsStream; // Stream bit
5019
5020 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Chain,
5021 DAG.getTargetConstant(PrfOp, DL, MVT::i32), Addr);
5022 }
5023 case Intrinsic::aarch64_sme_str:
5024 case Intrinsic::aarch64_sme_ldr: {
5025 return LowerSMELdrStr(Op, DAG, IntNo == Intrinsic::aarch64_sme_ldr);
5026 }
5027 case Intrinsic::aarch64_sme_za_enable:
5028 return DAG.getNode(
5029 AArch64ISD::SMSTART, DL, MVT::Other,
5030 Op->getOperand(0), // Chain
5031 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
5032 DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64));
5033 case Intrinsic::aarch64_sme_za_disable:
5034 return DAG.getNode(
5035 AArch64ISD::SMSTOP, DL, MVT::Other,
5036 Op->getOperand(0), // Chain
5037 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
5038 DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64));
5039 }
5040}
5041
5042SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
5043 SelectionDAG &DAG) const {
5044 unsigned IntNo = Op.getConstantOperandVal(1);
5045 SDLoc DL(Op);
5046 switch (IntNo) {
5047 default:
5048 return SDValue(); // Don't custom lower most intrinsics.
5049 case Intrinsic::aarch64_mops_memset_tag: {
5050 auto Node = cast<MemIntrinsicSDNode>(Op.getNode());
5051 SDValue Chain = Node->getChain();
5052 SDValue Dst = Op.getOperand(2);
5053 SDValue Val = Op.getOperand(3);
5054 Val = DAG.getAnyExtOrTrunc(Val, DL, MVT::i64);
5055 SDValue Size = Op.getOperand(4);
5056 auto Alignment = Node->getMemOperand()->getAlign();
5057 bool IsVol = Node->isVolatile();
5058 auto DstPtrInfo = Node->getPointerInfo();
5059
5060 const auto &SDI =
5061 static_cast<const AArch64SelectionDAGInfo &>(DAG.getSelectionDAGInfo());
5062 SDValue MS =
5063 SDI.EmitMOPS(AArch64ISD::MOPS_MEMSET_TAGGING, DAG, DL, Chain, Dst, Val,
5064 Size, Alignment, IsVol, DstPtrInfo, MachinePointerInfo{});
5065
5066 // MOPS_MEMSET_TAGGING has 3 results (DstWb, SizeWb, Chain) whereas the
5067 // intrinsic has 2. So hide SizeWb using MERGE_VALUES. Otherwise
5068 // LowerOperationWrapper will complain that the number of results has
5069 // changed.
5070 return DAG.getMergeValues({MS.getValue(0), MS.getValue(2)}, DL);
5071 }
5072 }
5073}
5074
// Custom-lower chainless intrinsics (ISD::INTRINSIC_WO_CHAIN). Most NEON/SVE
// intrinsics here are mapped 1:1 onto target ISD nodes; a few (pmull64,
// while*, uaddlv) get more involved canonicalization.
SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                                                       SelectionDAG &DAG) const {
  unsigned IntNo = Op.getConstantOperandVal(0);
  SDLoc dl(Op);
  switch (IntNo) {
  default: return SDValue(); // Don't custom lower most intrinsics.
  case Intrinsic::thread_pointer: {
    EVT PtrVT = getPointerTy(DAG.getDataLayout());
    return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
  }
  case Intrinsic::aarch64_neon_abs: {
    EVT Ty = Op.getValueType();
    // Scalar i64 abs is performed in the vector unit by round-tripping
    // through v1i64.
    if (Ty == MVT::i64) {
      SDValue Result = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64,
                                   Op.getOperand(1));
      Result = DAG.getNode(ISD::ABS, dl, MVT::v1i64, Result);
      return DAG.getNode(ISD::BITCAST, dl, MVT::i64, Result);
    } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) {
      return DAG.getNode(ISD::ABS, dl, Ty, Op.getOperand(1));
    } else {
      report_fatal_error("Unexpected type for AArch64 NEON intrinic");
    }
  }
  case Intrinsic::aarch64_neon_pmull64: {
    SDValue LHS = Op.getOperand(1);
    SDValue RHS = Op.getOperand(2);

    std::optional<uint64_t> LHSLane =
    std::optional<uint64_t> RHSLane =

    assert((!LHSLane || *LHSLane < 2) && "Expect lane to be None or 0 or 1");
    assert((!RHSLane || *RHSLane < 2) && "Expect lane to be None or 0 or 1");

    // 'aarch64_neon_pmull64' takes i64 parameters; while pmull/pmull2
    // instructions execute on SIMD registers. So canonicalize i64 to v1i64,
    // which ISel recognizes better. For example, generate a ldr into d*
    // registers as opposed to a GPR load followed by a fmov.
    auto TryVectorizeOperand = [](SDValue N, std::optional<uint64_t> NLane,
                                  std::optional<uint64_t> OtherLane,
                                  const SDLoc &dl,
                                  SelectionDAG &DAG) -> SDValue {
      // If the operand is an higher half itself, rewrite it to
      // extract_high_v2i64; this way aarch64_neon_pmull64 could
      // re-use the dag-combiner function with aarch64_neon_{pmull,smull,umull}.
      if (NLane && *NLane == 1)
        return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64,
                           N.getOperand(0), DAG.getConstant(1, dl, MVT::i64));

      // Operand N is not a higher half but the other operand is.
      if (OtherLane && *OtherLane == 1) {
        // If this operand is a lower half, rewrite it to
        // extract_high_v2i64(duplane(<2 x Ty>, 0)). This saves a roundtrip to
        // align lanes of two operands. A roundtrip sequence (to move from lane
        // 1 to lane 0) is like this:
        // mov x8, v0.d[1]
        // fmov d0, x8
        if (NLane && *NLane == 0)
          return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64,
                             DAG.getNode(AArch64ISD::DUPLANE64, dl, MVT::v2i64,
                                         N.getOperand(0),
                                         DAG.getConstant(0, dl, MVT::i64)),
                             DAG.getConstant(1, dl, MVT::i64));

        // Otherwise just dup from main to all lanes.
        return DAG.getNode(AArch64ISD::DUP, dl, MVT::v1i64, N);
      }

      // Neither operand is an extract of higher half, so codegen may just use
      // the non-high version of PMULL instruction. Use v1i64 to represent i64.
      assert(N.getValueType() == MVT::i64 &&
             "Intrinsic aarch64_neon_pmull64 requires i64 parameters");
      return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, N);
    };

    LHS = TryVectorizeOperand(LHS, LHSLane, RHSLane, dl, DAG);
    RHS = TryVectorizeOperand(RHS, RHSLane, LHSLane, dl, DAG);

    return DAG.getNode(AArch64ISD::PMULL, dl, Op.getValueType(), LHS, RHS);
  }
  // NEON min/max map directly onto the generic ISD nodes.
  case Intrinsic::aarch64_neon_smax:
    return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_neon_umax:
    return DAG.getNode(ISD::UMAX, dl, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_neon_smin:
    return DAG.getNode(ISD::SMIN, dl, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_neon_umin:
    return DAG.getNode(ISD::UMIN, dl, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_neon_scalar_sqxtn:
  case Intrinsic::aarch64_neon_scalar_sqxtun:
  case Intrinsic::aarch64_neon_scalar_uqxtn: {
    // i32 results are computed in FP registers: bitcast the i64 input to f64,
    // re-emit the intrinsic at f32, and bitcast the result back to i32.
    assert(Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::f32);
    if (Op.getValueType() == MVT::i32)
      return DAG.getNode(ISD::BITCAST, dl, MVT::i32,
                         DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::f32,
                                     Op.getOperand(0),
                                     DAG.getNode(ISD::BITCAST, dl, MVT::f64,
                                                 Op.getOperand(1))));
    return SDValue();
  }
  // SVE while-comparisons with constant bounds may fold to a PTRUE; see
  // optimizeWhile for the flag meanings.
  case Intrinsic::aarch64_sve_whilelo:
    return optimizeWhile(Op, DAG, /*IsSigned=*/false, /*IsLess=*/true,
                         /*IsEqual=*/false);
  case Intrinsic::aarch64_sve_whilelt:
    return optimizeWhile(Op, DAG, /*IsSigned=*/true, /*IsLess=*/true,
                         /*IsEqual=*/false);
  case Intrinsic::aarch64_sve_whilels:
    return optimizeWhile(Op, DAG, /*IsSigned=*/false, /*IsLess=*/true,
                         /*IsEqual=*/true);
  case Intrinsic::aarch64_sve_whilele:
    return optimizeWhile(Op, DAG, /*IsSigned=*/true, /*IsLess=*/true,
                         /*IsEqual=*/true);
  case Intrinsic::aarch64_sve_whilege:
    return optimizeWhile(Op, DAG, /*IsSigned=*/true, /*IsLess=*/false,
                         /*IsEqual=*/true);
  case Intrinsic::aarch64_sve_whilegt:
    return optimizeWhile(Op, DAG, /*IsSigned=*/true, /*IsLess=*/false,
                         /*IsEqual=*/false);
  case Intrinsic::aarch64_sve_whilehs:
    return optimizeWhile(Op, DAG, /*IsSigned=*/false, /*IsLess=*/false,
                         /*IsEqual=*/true);
  case Intrinsic::aarch64_sve_whilehi:
    return optimizeWhile(Op, DAG, /*IsSigned=*/false, /*IsLess=*/false,
                         /*IsEqual=*/false);
  // SVE permute/unpack intrinsics map 1:1 onto AArch64ISD nodes.
  case Intrinsic::aarch64_sve_sunpkhi:
    return DAG.getNode(AArch64ISD::SUNPKHI, dl, Op.getValueType(),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_sunpklo:
    return DAG.getNode(AArch64ISD::SUNPKLO, dl, Op.getValueType(),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_uunpkhi:
    return DAG.getNode(AArch64ISD::UUNPKHI, dl, Op.getValueType(),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_uunpklo:
    return DAG.getNode(AArch64ISD::UUNPKLO, dl, Op.getValueType(),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_clasta_n:
    return DAG.getNode(AArch64ISD::CLASTA_N, dl, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::aarch64_sve_clastb_n:
    return DAG.getNode(AArch64ISD::CLASTB_N, dl, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::aarch64_sve_lasta:
    return DAG.getNode(AArch64ISD::LASTA, dl, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_sve_lastb:
    return DAG.getNode(AArch64ISD::LASTB, dl, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_sve_rev:
    return DAG.getNode(ISD::VECTOR_REVERSE, dl, Op.getValueType(),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_tbl:
    return DAG.getNode(AArch64ISD::TBL, dl, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_sve_trn1:
    return DAG.getNode(AArch64ISD::TRN1, dl, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_sve_trn2:
    return DAG.getNode(AArch64ISD::TRN2, dl, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_sve_uzp1:
    return DAG.getNode(AArch64ISD::UZP1, dl, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_sve_uzp2:
    return DAG.getNode(AArch64ISD::UZP2, dl, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_sve_zip1:
    return DAG.getNode(AArch64ISD::ZIP1, dl, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_sve_zip2:
    return DAG.getNode(AArch64ISD::ZIP2, dl, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_sve_splice:
    return DAG.getNode(AArch64ISD::SPLICE, dl, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::aarch64_sve_ptrue:
    return getPTrue(DAG, dl, Op.getValueType(), Op.getConstantOperandVal(1));
  case Intrinsic::aarch64_sve_clz:
    return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, dl, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  // SME counting intrinsics: RDSVL #1 gives the streaming vector length in
  // bytes; halfword/word/doubleword counts are derived by shifting right.
  case Intrinsic::aarch64_sme_cntsb:
    return DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
                       DAG.getConstant(1, dl, MVT::i32));
  case Intrinsic::aarch64_sme_cntsh: {
    SDValue One = DAG.getConstant(1, dl, MVT::i32);
    SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(), One);
    return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes, One);
  }
  case Intrinsic::aarch64_sme_cntsw: {
    SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
                                DAG.getConstant(1, dl, MVT::i32));
    return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes,
                       DAG.getConstant(2, dl, MVT::i32));
  }
  case Intrinsic::aarch64_sme_cntsd: {
    SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
                                DAG.getConstant(1, dl, MVT::i32));
    return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes,
                       DAG.getConstant(3, dl, MVT::i32));
  }
  case Intrinsic::aarch64_sve_cnt: {
    SDValue Data = Op.getOperand(3);
    // CTPOP only supports integer operands.
    if (Data.getValueType().isFloatingPoint())
      Data = DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Data);
    return DAG.getNode(AArch64ISD::CTPOP_MERGE_PASSTHRU, dl, Op.getValueType(),
                       Op.getOperand(2), Data, Op.getOperand(1));
  }
  case Intrinsic::aarch64_sve_dupq_lane:
    return LowerDUPQLane(Op, DAG);
  // svbool conversions: svcount uses a plain bitcast; other predicate types
  // go through getSVEPredicateBitCast so widened lanes are zeroed.
  case Intrinsic::aarch64_sve_convert_from_svbool:
    if (Op.getValueType() == MVT::aarch64svcount)
      return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Op.getOperand(1));
    return getSVEPredicateBitCast(Op.getValueType(), Op.getOperand(1), DAG);
  case Intrinsic::aarch64_sve_convert_to_svbool:
    if (Op.getOperand(1).getValueType() == MVT::aarch64svcount)
      return DAG.getNode(ISD::BITCAST, dl, MVT::nxv16i1, Op.getOperand(1));
    return getSVEPredicateBitCast(MVT::nxv16i1, Op.getOperand(1), DAG);
  // Predicated SVE unary FP/integer operations: operands are re-ordered to
  // (pg, src, passthru) for the *_MERGE_PASSTHRU nodes.
  case Intrinsic::aarch64_sve_fneg:
    return DAG.getNode(AArch64ISD::FNEG_MERGE_PASSTHRU, dl, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_frintp:
    return DAG.getNode(AArch64ISD::FCEIL_MERGE_PASSTHRU, dl, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_frintm:
    return DAG.getNode(AArch64ISD::FFLOOR_MERGE_PASSTHRU, dl, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_frinti:
    return DAG.getNode(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU, dl, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_frintx:
    return DAG.getNode(AArch64ISD::FRINT_MERGE_PASSTHRU, dl, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_frinta:
    return DAG.getNode(AArch64ISD::FROUND_MERGE_PASSTHRU, dl, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_frintn:
    return DAG.getNode(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU, dl, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_frintz:
    return DAG.getNode(AArch64ISD::FTRUNC_MERGE_PASSTHRU, dl, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_ucvtf:
                       Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_scvtf:
                       Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_fcvtzu:
                       Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_fcvtzs:
                       Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_fsqrt:
    return DAG.getNode(AArch64ISD::FSQRT_MERGE_PASSTHRU, dl, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_frecpx:
    return DAG.getNode(AArch64ISD::FRECPX_MERGE_PASSTHRU, dl, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_frecpe_x:
    return DAG.getNode(AArch64ISD::FRECPE, dl, Op.getValueType(),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_frecps_x:
    return DAG.getNode(AArch64ISD::FRECPS, dl, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_sve_frsqrte_x:
    return DAG.getNode(AArch64ISD::FRSQRTE, dl, Op.getValueType(),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_frsqrts_x:
    return DAG.getNode(AArch64ISD::FRSQRTS, dl, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_sve_fabs:
    return DAG.getNode(AArch64ISD::FABS_MERGE_PASSTHRU, dl, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_abs:
    return DAG.getNode(AArch64ISD::ABS_MERGE_PASSTHRU, dl, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_neg:
    return DAG.getNode(AArch64ISD::NEG_MERGE_PASSTHRU, dl, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_insr: {
    SDValue Scalar = Op.getOperand(2);
    EVT ScalarTy = Scalar.getValueType();
    // Sub-i32 scalars are widened to i32 before being inserted.
    if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
      Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);

    return DAG.getNode(AArch64ISD::INSR, dl, Op.getValueType(),
                       Op.getOperand(1), Scalar);
  }
  case Intrinsic::aarch64_sve_rbit:
                       Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_revb:
    return DAG.getNode(AArch64ISD::BSWAP_MERGE_PASSTHRU, dl, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_revh:
    return DAG.getNode(AArch64ISD::REVH_MERGE_PASSTHRU, dl, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_revw:
    return DAG.getNode(AArch64ISD::REVW_MERGE_PASSTHRU, dl, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_revd:
    return DAG.getNode(AArch64ISD::REVD_MERGE_PASSTHRU, dl, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  // SVE sign/zero extensions carry the source element type as a VTSDNode.
  case Intrinsic::aarch64_sve_sxtb:
    return DAG.getNode(
        Op.getOperand(2), Op.getOperand(3),
        DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
        Op.getOperand(1));
  case Intrinsic::aarch64_sve_sxth:
    return DAG.getNode(
        Op.getOperand(2), Op.getOperand(3),
        DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
        Op.getOperand(1));
  case Intrinsic::aarch64_sve_sxtw:
    return DAG.getNode(
        Op.getOperand(2), Op.getOperand(3),
        DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
        Op.getOperand(1));
  case Intrinsic::aarch64_sve_uxtb:
    return DAG.getNode(
        Op.getOperand(2), Op.getOperand(3),
        DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
        Op.getOperand(1));
  case Intrinsic::aarch64_sve_uxth:
    return DAG.getNode(
        Op.getOperand(2), Op.getOperand(3),
        DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
        Op.getOperand(1));
  case Intrinsic::aarch64_sve_uxtw:
    return DAG.getNode(
        Op.getOperand(2), Op.getOperand(3),
        DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
        Op.getOperand(1));
  case Intrinsic::localaddress: {
    const auto &MF = DAG.getMachineFunction();
    const auto *RegInfo = Subtarget->getRegisterInfo();
    unsigned Reg = RegInfo->getLocalAddressRegister(MF);
    return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg,
                              Op.getSimpleValueType());
  }

  case Intrinsic::eh_recoverfp: {
    // FIXME: This needs to be implemented to correctly handle highly aligned
    // stack objects. For now we simply return the incoming FP. Refer D53541
    // for more details.
    SDValue FnOp = Op.getOperand(1);
    SDValue IncomingFPOp = Op.getOperand(2);
    GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
    auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
    if (!Fn)
          "llvm.eh.recoverfp must take a function as the first argument");
    return IncomingFPOp;
  }

  case Intrinsic::aarch64_neon_vsri:
  case Intrinsic::aarch64_neon_vsli:
  case Intrinsic::aarch64_sve_sri:
  case Intrinsic::aarch64_sve_sli: {
    EVT Ty = Op.getValueType();

    if (!Ty.isVector())
      report_fatal_error("Unexpected type for aarch64_neon_vsli");

    assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits());

    bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri ||
                        IntNo == Intrinsic::aarch64_sve_sri;
    unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
    return DAG.getNode(Opcode, dl, Ty, Op.getOperand(1), Op.getOperand(2),
                       Op.getOperand(3));
  }

  case Intrinsic::aarch64_neon_srhadd:
  case Intrinsic::aarch64_neon_urhadd:
  case Intrinsic::aarch64_neon_shadd:
  case Intrinsic::aarch64_neon_uhadd: {
    // Halving adds map onto the generic averaging nodes: rhadd rounds up
    // (AVGCEIL) and hadd rounds down (AVGFLOOR).
    bool IsSignedAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
                        IntNo == Intrinsic::aarch64_neon_shadd);
    bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
                          IntNo == Intrinsic::aarch64_neon_urhadd);
    unsigned Opcode = IsSignedAdd
                          ? (IsRoundingAdd ? ISD::AVGCEILS : ISD::AVGFLOORS)
                          : (IsRoundingAdd ? ISD::AVGCEILU : ISD::AVGFLOORU);
    return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
                       Op.getOperand(2));
  }
  case Intrinsic::aarch64_neon_saddlp:
  case Intrinsic::aarch64_neon_uaddlp: {
    unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uaddlp
    return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1));
  }
  case Intrinsic::aarch64_neon_sdot:
  case Intrinsic::aarch64_neon_udot:
  case Intrinsic::aarch64_sve_sdot:
  case Intrinsic::aarch64_sve_udot: {
    unsigned Opcode = (IntNo == Intrinsic::aarch64_neon_udot ||
                       IntNo == Intrinsic::aarch64_sve_udot)
    return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3));
  }
  case Intrinsic::get_active_lane_mask: {
    // Re-emit as the SVE whilelo intrinsic for instruction selection.
    SDValue ID =
        DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, dl, MVT::i64);
    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), ID,
                       Op.getOperand(1), Op.getOperand(2));
  }
  case Intrinsic::aarch64_neon_uaddlv: {
    EVT OpVT = Op.getOperand(1).getValueType();
    EVT ResVT = Op.getValueType();
    if (ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8 ||
                              OpVT == MVT::v8i16 || OpVT == MVT::v4i16)) {
      // In order to avoid insert_subvector, used v4i32 than v2i32.
      SDValue UADDLV =
          DAG.getNode(AArch64ISD::UADDLV, dl, MVT::v4i32, Op.getOperand(1));
      SDValue EXTRACT_VEC_ELT =
          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, UADDLV,
                      DAG.getConstant(0, dl, MVT::i64));
      return EXTRACT_VEC_ELT;
    }
    return SDValue();
  }
  case Intrinsic::experimental_cttz_elts: {
    SDValue NewCttzElts =
        DAG.getNode(AArch64ISD::CTTZ_ELTS, dl, MVT::i64, Op.getOperand(1));

    return DAG.getZExtOrTrunc(NewCttzElts, dl, Op.getValueType());
  }
  }
}
5527
5528bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const {
5529 if (VT.getVectorElementType() == MVT::i8 ||
5530 VT.getVectorElementType() == MVT::i16) {
5531 EltTy = MVT::i32;
5532 return true;
5533 }
5534 return false;
5535}
5536
5537bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(SDValue Extend,
5538 EVT DataVT) const {
5539 const EVT IndexVT = Extend.getOperand(0).getValueType();
5540 // SVE only supports implicit extension of 32-bit indices.
5541 if (!Subtarget->hasSVE() || IndexVT.getVectorElementType() != MVT::i32)
5542 return false;
5543
5544 // Indices cannot be smaller than the main data type.
5545 if (IndexVT.getScalarSizeInBits() < DataVT.getScalarSizeInBits())
5546 return false;
5547
5548 // Scalable vectors with "vscale * 2" or fewer elements sit within a 64-bit
5549 // element container type, which would violate the previous clause.
5550 return DataVT.isFixedLengthVector() || DataVT.getVectorMinNumElements() > 2;
5551}
5552
5553bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
5554 EVT ExtVT = ExtVal.getValueType();
5555 if (!ExtVT.isScalableVector() && !Subtarget->useSVEForFixedLengthVectors())
5556 return false;
5557
5558 // It may be worth creating extending masked loads if there are multiple
5559 // masked loads using the same predicate. That way we'll end up creating
5560 // extending masked loads that may then get split by the legaliser. This
5561 // results in just one set of predicate unpacks at the start, instead of
5562 // multiple sets of vector unpacks after each load.
5563 if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal->getOperand(0))) {
5564 if (!isLoadExtLegalOrCustom(ISD::ZEXTLOAD, ExtVT, Ld->getValueType(0))) {
5565 // Disable extending masked loads for fixed-width for now, since the code
5566 // quality doesn't look great.
5567 if (!ExtVT.isScalableVector())
5568 return false;
5569
5570 unsigned NumExtMaskedLoads = 0;
5571 for (auto *U : Ld->getMask()->uses())
5572 if (isa<MaskedLoadSDNode>(U))
5573 NumExtMaskedLoads++;
5574
5575 if (NumExtMaskedLoads <= 1)
5576 return false;
5577 }
5578 }
5579
5580 return true;
5581}
5582
// Map the (IsScaled, IsSigned, NeedsExtend) addressing-mode triple onto the
// corresponding AArch64 gather opcode. All eight combinations are listed in
// the table, so the find() below is assumed to succeed.
// NOTE(review): the table is rebuilt on every call; a static table would
// avoid the repeated allocations — confirm this is not on a hot path.
unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
  std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
      {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
      {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
      {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false),
      {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true),
      {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false),
      {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true),
      {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false),
      {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true),
  };
  auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
  return AddrModes.find(Key)->second;
}
5605
// Maps an SVE gather opcode to its sign-extending (GLD1S*) counterpart.
// NOTE(review): the individual case labels (original lines 5611-5624) are
// not visible in this dump; only the default case is shown here.
5606unsigned getSignExtendedGatherOpcode(unsigned Opcode) {
5607 switch (Opcode) {
5608 default:
    // An opcode with no sign-extending variant is a programming error;
    // the return below is unreachable and only placates the compiler.
5609 llvm_unreachable("unimplemented opcode");
5610 return Opcode;
5625 }
5626}
5627
// Custom lowering for ISD::MGATHER on AArch64/SVE. Handles three cases the
// generic node cannot express directly: non-zero passthrough values (via an
// explicit select), indices scaled by something other than the memory
// element size (pre-shifted with SHL), and fixed-length gathers (rewritten
// as an equivalent scalable-vector gather).
5628SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
5629 SelectionDAG &DAG) const {
5630 MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Op);
5631
5632 SDLoc DL(Op);
5633 SDValue Chain = MGT->getChain();
5634 SDValue PassThru = MGT->getPassThru();
5635 SDValue Mask = MGT->getMask();
5636 SDValue BasePtr = MGT->getBasePtr();
5637 SDValue Index = MGT->getIndex();
5638 SDValue Scale = MGT->getScale();
5639 EVT VT = Op.getValueType();
5640 EVT MemVT = MGT->getMemoryVT();
5641 ISD::LoadExtType ExtType = MGT->getExtensionType();
5642 ISD::MemIndexType IndexType = MGT->getIndexType();
5643
5644 // SVE supports zero (and so undef) passthrough values only, everything else
5645 // must be handled manually by an explicit select on the load's output.
5646 if (!PassThru->isUndef() && !isZerosVector(PassThru.getNode())) {
    // Re-emit the gather with an undef passthrough, then select between the
    // loaded value and the original passthrough under the mask.
5647 SDValue Ops[] = {Chain, DAG.getUNDEF(VT), Mask, BasePtr, Index, Scale};
5648 SDValue Load =
5649 DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
5650 MGT->getMemOperand(), IndexType, ExtType);
5651 SDValue Select = DAG.getSelect(DL, VT, Mask, Load, PassThru);
5652 return DAG.getMergeValues({Select, Load.getValue(1)}, DL);
5653 }
5654
5655 bool IsScaled = MGT->isIndexScaled();
5656 bool IsSigned = MGT->isIndexSigned();
5657
5658 // SVE supports an index scaled by sizeof(MemVT.elt) only, everything else
5659 // must be calculated before hand.
5660 uint64_t ScaleVal = Scale->getAsZExtVal();
5661 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
5662 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
5663 EVT IndexVT = Index.getValueType();
    // Fold the scale into the index (Index << log2(Scale)) and re-emit the
    // gather with a unit scale.
5664 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
5665 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
5666 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
5667
5668 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
5669 return DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
5670 MGT->getMemOperand(), IndexType, ExtType);
5671 }
5672
5673 // Lower fixed length gather to a scalable equivalent.
5674 if (VT.isFixedLengthVector()) {
5675 assert(Subtarget->useSVEForFixedLengthVectors() &&
5676 "Cannot lower when not using SVE for fixed vectors!");
5677
5678 // NOTE: Handle floating-point as if integer then bitcast the result.
    // NOTE(review): the definition of DataVT (original line 5679) is not
    // visible in this dump; it is used below when choosing PromotedVT and
    // when truncating the result.
5680 MemVT = MemVT.changeVectorElementTypeToInteger();
5681
5682 // Find the smallest integer fixed length vector we can use for the gather.
5683 EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
5684 if (DataVT.getVectorElementType() == MVT::i64 ||
5685 Index.getValueType().getVectorElementType() == MVT::i64 ||
5686 Mask.getValueType().getVectorElementType() == MVT::i64)
5687 PromotedVT = VT.changeVectorElementType(MVT::i64);
5688
5689 // Promote vector operands except for passthrough, which we know is either
5690 // undef or zero, and thus best constructed directly.
5691 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
5692 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
5693 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
5694
5695 // A promoted result type forces the need for an extending load.
5696 if (PromotedVT != DataVT && ExtType == ISD::NON_EXTLOAD)
5697 ExtType = ISD::EXTLOAD;
5698
5699 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
5700
5701 // Convert fixed length vector operands to scalable.
5702 MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
5703 Index = convertToScalableVector(DAG, ContainerVT, Index);
    // NOTE(review): the conversion of Mask to a scalable vector (original
    // line 5704) is not visible in this dump.
5705 PassThru = PassThru->isUndef() ? DAG.getUNDEF(ContainerVT)
5706 : DAG.getConstant(0, DL, ContainerVT);
5707
5708 // Emit equivalent scalable vector gather.
5709 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
5710 SDValue Load =
5711 DAG.getMaskedGather(DAG.getVTList(ContainerVT, MVT::Other), MemVT, DL,
5712 Ops, MGT->getMemOperand(), IndexType, ExtType);
5713
5714 // Extract fixed length data then convert to the required result type.
5715 SDValue Result = convertFromScalableVector(DAG, PromotedVT, Load);
5716 Result = DAG.getNode(ISD::TRUNCATE, DL, DataVT, Result);
5717 if (VT.isFloatingPoint())
5718 Result = DAG.getNode(ISD::BITCAST, DL, VT, Result);
5719
5720 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
5721 }
5722
5723 // Everything else is legal.
5724 return Op;
5725}
5726
// Custom lowering for ISD::MSCATTER on AArch64/SVE. Mirrors LowerMGATHER:
// pre-scales indices SVE cannot scale itself and rewrites fixed-length
// scatters as equivalent scalable-vector scatters.
5727SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
5728 SelectionDAG &DAG) const {
5729 MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Op);
5730
5731 SDLoc DL(Op);
5732 SDValue Chain = MSC->getChain();
5733 SDValue StoreVal = MSC->getValue();
5734 SDValue Mask = MSC->getMask();
5735 SDValue BasePtr = MSC->getBasePtr();
5736 SDValue Index = MSC->getIndex();
5737 SDValue Scale = MSC->getScale();
5738 EVT VT = StoreVal.getValueType();
5739 EVT MemVT = MSC->getMemoryVT();
5740 ISD::MemIndexType IndexType = MSC->getIndexType();
5741 bool Truncating = MSC->isTruncatingStore();
5742
5743 bool IsScaled = MSC->isIndexScaled();
5744 bool IsSigned = MSC->isIndexSigned();
5745
5746 // SVE supports an index scaled by sizeof(MemVT.elt) only, everything else
5747 // must be calculated before hand.
5748 uint64_t ScaleVal = Scale->getAsZExtVal();
5749 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
5750 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
5751 EVT IndexVT = Index.getValueType();
    // Fold the scale into the index and re-emit with a unit scale.
5752 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
5753 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
5754 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
5755
5756 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
5757 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
5758 MSC->getMemOperand(), IndexType, Truncating);
5759 }
5760
5761 // Lower fixed length scatter to a scalable equivalent.
5762 if (VT.isFixedLengthVector()) {
5763 assert(Subtarget->useSVEForFixedLengthVectors() &&
5764 "Cannot lower when not using SVE for fixed vectors!");
5765
5766 // Once bitcast we treat floating-point scatters as if integer.
5767 if (VT.isFloatingPoint()) {
    // NOTE(review): original line 5768 (presumably updating VT to its
    // integer equivalent) is not visible in this dump.
5769 MemVT = MemVT.changeVectorElementTypeToInteger();
5770 StoreVal = DAG.getNode(ISD::BITCAST, DL, VT, StoreVal);
5771 }
5772
5773 // Find the smallest integer fixed length vector we can use for the scatter.
5774 EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
5775 if (VT.getVectorElementType() == MVT::i64 ||
5776 Index.getValueType().getVectorElementType() == MVT::i64 ||
5777 Mask.getValueType().getVectorElementType() == MVT::i64)
5778 PromotedVT = VT.changeVectorElementType(MVT::i64);
5779
5780 // Promote vector operands.
5781 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
5782 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
5783 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
5784 StoreVal = DAG.getNode(ISD::ANY_EXTEND, DL, PromotedVT, StoreVal);
5785
5786 // A promoted value type forces the need for a truncating store.
5787 if (PromotedVT != VT)
5788 Truncating = true;
5789
5790 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
5791
5792 // Convert fixed length vector operands to scalable.
5793 MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
5794 Index = convertToScalableVector(DAG, ContainerVT, Index);
    // NOTE(review): the conversion of Mask to a scalable vector (original
    // line 5795) is not visible in this dump.
5796 StoreVal = convertToScalableVector(DAG, ContainerVT, StoreVal);
5797
5798 // Emit equivalent scalable vector scatter.
5799 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
5800 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
5801 MSC->getMemOperand(), IndexType, Truncating);
5802 }
5803
5804 // Everything else is legal.
5805 return Op;
5806}
5807
// Custom lowering for ISD::MLOAD. Fixed-length vectors routed through SVE
// are handled separately; otherwise a non-zero/non-undef passthrough is
// lowered as a masked load with an undef passthrough followed by a select.
5808SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
5809 SDLoc DL(Op);
5810 MaskedLoadSDNode *LoadNode = cast<MaskedLoadSDNode>(Op);
5811 assert(LoadNode && "Expected custom lowering of a masked load node");
5812 EVT VT = Op->getValueType(0);
5813
5814 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
5815 return LowerFixedLengthVectorMLoadToSVE(Op, DAG);
5816
5817 SDValue PassThru = LoadNode->getPassThru();
5818 SDValue Mask = LoadNode->getMask();
5819
  // Undef/zero passthrough values are supported directly; nothing to do.
5820 if (PassThru->isUndef() || isZerosVector(PassThru.getNode()))
5821 return Op;
5822
  // NOTE(review): original line 5823 (the start of the replacement masked
  // load creation that defines `Load`) is not visible in this dump.
5824 VT, DL, LoadNode->getChain(), LoadNode->getBasePtr(),
5825 LoadNode->getOffset(), Mask, DAG.getUNDEF(VT), LoadNode->getMemoryVT(),
5826 LoadNode->getMemOperand(), LoadNode->getAddressingMode(),
5827 LoadNode->getExtensionType());
5828
  // Select between the loaded lanes and the original passthrough.
5829 SDValue Result = DAG.getSelect(DL, VT, Mask, Load, PassThru);
5830
5831 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
5832}
5833
5834// Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
// NOTE(review): the opening line of this static helper's signature (original
// line 5835, introducing the DL and ST parameters) is not visible in this
// dump; the remaining parameters are the value type, memory type and DAG.
5836 EVT VT, EVT MemVT,
5837 SelectionDAG &DAG) {
5838 assert(VT.isVector() && "VT should be a vector type");
5839 assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
5840
5841 SDValue Value = ST->getValue();
5842
5843 // It first extend the promoted v4i16 to v8i16, truncate to v8i8, and extract
5844 // the word lane which represent the v4i8 subvector. It optimizes the store
5845 // to:
5846 //
5847 // xtn v0.8b, v0.8h
5848 // str s0, [x0]
5849
  // Build an all-undef v4i16 to pad the value up to v8i16.
5850 SDValue Undef = DAG.getUNDEF(MVT::i16)
5851 SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
5852 {Undef, Undef, Undef, Undef});
5853
5854 SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
5855 Value, UndefVec);
5856 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
5857
  // Reinterpret the v8i8 as v2i32 and pull out the low 32-bit lane, which
  // holds exactly the four truncated bytes to store.
5858 Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
5859 SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
5860 Trunc, DAG.getConstant(0, DL, MVT::i64));
5861
5862 return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
5863 ST->getBasePtr(), ST->getMemOperand());
5864}
5865
5866// Custom lowering for any store, vector or scalar and/or default or with
5867// a truncate operations. Currently only custom lower truncate operation
5868// from vector v4i16 to v4i8 or volatile stores of i128.
5869SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
5870 SelectionDAG &DAG) const {
5871 SDLoc Dl(Op);
5872 StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
5873 assert (StoreNode && "Can only custom lower store nodes");
5874
5875 SDValue Value = StoreNode->getValue();
5876
5877 EVT VT = Value.getValueType();
5878 EVT MemVT = StoreNode->getMemoryVT();
5879
5880 if (VT.isVector()) {
    // NOTE(review): original line 5881 (the start of the condition, which by
    // its arguments below is a useSVEForFixedLengthVectorVT-style check) is
    // not visible in this dump.
5882 VT,
5883 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
5884 return LowerFixedLengthVectorStoreToSVE(Op, DAG);
5885
5886 unsigned AS = StoreNode->getAddressSpace();
5887 Align Alignment = StoreNode->getAlign();
    // Under-aligned vector stores the target cannot perform directly are
    // scalarized instead.
5888 if (Alignment < MemVT.getStoreSize() &&
5889 !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment,
5890 StoreNode->getMemOperand()->getFlags(),
5891 nullptr)) {
5892 return scalarizeVectorStore(StoreNode, DAG);
5893 }
5894
5895 if (StoreNode->isTruncatingStore() && VT == MVT::v4i16 &&
5896 MemVT == MVT::v4i8) {
5897 return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
5898 }
5899 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of
5900 // the custom lowering, as there are no un-paired non-temporal stores and
5901 // legalization will break up 256 bit inputs.
    // NOTE(review): original line 5902 (the definition of EC, used below and
    // read via getVectorElementCount-style accessors) is not visible here.
5903 if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u &&
5904 EC.isKnownEven() && DAG.getDataLayout().isLittleEndian() &&
5905 (MemVT.getScalarSizeInBits() == 8u ||
5906 MemVT.getScalarSizeInBits() == 16u ||
5907 MemVT.getScalarSizeInBits() == 32u ||
5908 MemVT.getScalarSizeInBits() == 64u)) {
    // Split the 256-bit value into low/high halves for a paired STNP.
    // NOTE(review): original lines 5910-5911 and 5914-5915 (the node
    // creation for Lo and Hi, visibly extracting subvectors at element 0
    // and EC/2) are not visible in this dump.
5909 SDValue Lo =
5912 StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64));
5913 SDValue Hi =
5916 StoreNode->getValue(),
5917 DAG.getConstant(EC.getKnownMinValue() / 2, Dl, MVT::i64));
    // NOTE(review): original line 5918 (the start of the node creation that
    // defines Result) is not visible in this dump.
5919 AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other),
5920 {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
5921 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
5922 return Result;
5923 }
5924 } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
5925 return LowerStore128(Op, DAG);
5926 } else if (MemVT == MVT::i64x8) {
    // i64x8 (LS64 data) is stored as eight consecutive 64-bit stores, each
    // part extracted with LS64_EXTRACT at offset i*8.
5927 SDValue Value = StoreNode->getValue();
5928 assert(Value->getValueType(0) == MVT::i64x8);
5929 SDValue Chain = StoreNode->getChain();
5930 SDValue Base = StoreNode->getBasePtr();
5931 EVT PtrVT = Base.getValueType();
5932 for (unsigned i = 0; i < 8; i++) {
5933 SDValue Part = DAG.getNode(AArch64ISD::LS64_EXTRACT, Dl, MVT::i64,
5934 Value, DAG.getConstant(i, Dl, MVT::i32));
5935 SDValue Ptr = DAG.getNode(ISD::ADD, Dl, PtrVT, Base,
5936 DAG.getConstant(i * 8, Dl, PtrVT));
5937 Chain = DAG.getStore(Chain, Dl, Part, Ptr, StoreNode->getPointerInfo(),
5938 StoreNode->getOriginalAlign());
5939 }
5940 return Chain;
5941 }
5942
5943 return SDValue();
5944}
5945
5946/// Lower atomic or volatile 128-bit stores to a single STP instruction.
5947SDValue AArch64TargetLowering::LowerStore128(SDValue Op,
5948 SelectionDAG &DAG) const {
5949 MemSDNode *StoreNode = cast<MemSDNode>(Op);
5950 assert(StoreNode->getMemoryVT() == MVT::i128);
5951 assert(StoreNode->isVolatile() || StoreNode->isAtomic());
5952
  // NOTE(review): original line 5954 (the initializer of IsStoreRelease) is
  // not visible in this dump; the flag selects STILP over STP below.
5953 bool IsStoreRelease =
  // NOTE(review): original lines 5958-5959 (the remainder of this assert's
  // condition and message) are not visible in this dump.
5955 if (StoreNode->isAtomic())
5956 assert((Subtarget->hasFeature(AArch64::FeatureLSE2) &&
5957 Subtarget->hasFeature(AArch64::FeatureRCPC3) && IsStoreRelease) ||
5960
  // The stored value sits at operand 1 for STORE/ATOMIC_STORE nodes and at
  // operand 2 otherwise.
5961 SDValue Value = (StoreNode->getOpcode() == ISD::STORE ||
5962 StoreNode->getOpcode() == ISD::ATOMIC_STORE)
5963 ? StoreNode->getOperand(1)
5964 : StoreNode->getOperand(2);
5965 SDLoc DL(Op);
  // Split the 128-bit value into a pair of 64-bit halves for the paired
  // store; big-endian targets need the halves swapped.
5966 auto StoreValue = DAG.SplitScalar(Value, DL, MVT::i64, MVT::i64);
5967 unsigned Opcode = IsStoreRelease ? AArch64ISD::STILP : AArch64ISD::STP;
5968 if (DAG.getDataLayout().isBigEndian())
5969 std::swap(StoreValue.first, StoreValue.second);
  // NOTE(review): original line 5970 (the start of the node creation that
  // defines Result) is not visible in this dump.
5971 Opcode, DL, DAG.getVTList(MVT::Other),
5972 {StoreNode->getChain(), StoreValue.first, StoreValue.second,
5973 StoreNode->getBasePtr()},
5974 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
5975 return Result;
5976}
5977
// Custom lowering for ISD::LOAD: i64x8 (LS64) loads become eight chained
// 64-bit loads rebuilt with LS64_BUILD; extending v4i8 loads are widened
// through an f32 scalar load plus vector bitcast/extend.
5978SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
5979 SelectionDAG &DAG) const {
5980 SDLoc DL(Op);
5981 LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
5982 assert(LoadNode && "Expected custom lowering of a load node");
5983
5984 if (LoadNode->getMemoryVT() == MVT::i64x8) {
    // NOTE(review): original line 5985 (the declaration of the Ops vector
    // populated below) is not visible in this dump.
5986 SDValue Base = LoadNode->getBasePtr();
5987 SDValue Chain = LoadNode->getChain();
5988 EVT PtrVT = Base.getValueType();
    // Load eight consecutive i64 parts, threading the chain through each.
5989 for (unsigned i = 0; i < 8; i++) {
5990 SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
5991 DAG.getConstant(i * 8, DL, PtrVT));
5992 SDValue Part = DAG.getLoad(MVT::i64, DL, Chain, Ptr,
5993 LoadNode->getPointerInfo(),
5994 LoadNode->getOriginalAlign());
5995 Ops.push_back(Part);
5996 Chain = SDValue(Part.getNode(), 1);
5997 }
5998 SDValue Loaded = DAG.getNode(AArch64ISD::LS64_BUILD, DL, MVT::i64x8, Ops);
5999 return DAG.getMergeValues({Loaded, Chain}, DL);
6000 }
6001
6002 // Custom lowering for extending v4i8 vector loads.
6003 EVT VT = Op->getValueType(0);
6004 assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32");
6005
6006 if (LoadNode->getMemoryVT() != MVT::v4i8)
6007 return SDValue();
6008
  // Map the load's extension kind to the vector extend opcode; any-extend
  // is treated as zero-extend.
6009 unsigned ExtType;
6010 if (LoadNode->getExtensionType() == ISD::SEXTLOAD)
6011 ExtType = ISD::SIGN_EXTEND;
6012 else if (LoadNode->getExtensionType() == ISD::ZEXTLOAD ||
6013 LoadNode->getExtensionType() == ISD::EXTLOAD)
6014 ExtType = ISD::ZERO_EXTEND;
6015 else
6016 return SDValue();
6017
  // Load the four bytes as a single f32 scalar, move it into a vector
  // register, reinterpret as v8i8, then extend and take the low v4i16 half.
6018 SDValue Load = DAG.getLoad(MVT::f32, DL, LoadNode->getChain(),
6019 LoadNode->getBasePtr(), MachinePointerInfo());
6020 SDValue Chain = Load.getValue(1);
6021 SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f32, Load);
6022 SDValue BC = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Vec);
6023 SDValue Ext = DAG.getNode(ExtType, DL, MVT::v8i16, BC);
6024 Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Ext,
6025 DAG.getConstant(0, DL, MVT::i64));
6026 if (VT == MVT::v4i32)
6027 Ext = DAG.getNode(ExtType, DL, MVT::v4i32, Ext);
6028 return DAG.getMergeValues({Ext, Chain}, DL);
6029}
6030
6031// Generate SUBS and CSEL for integer abs.
6032SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
6033 MVT VT = Op.getSimpleValueType();
6034
6035 if (VT.isVector())
6036 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABS_MERGE_PASSTHRU);
6037
6038 SDLoc DL(Op);
6039 SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
6040 Op.getOperand(0));
6041 // Generate SUBS & CSEL.
6042 SDValue Cmp =
6043 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
6044 Op.getOperand(0), DAG.getConstant(0, DL, VT));
6045 return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg,
6046 DAG.getConstant(AArch64CC::PL, DL, MVT::i32),
6047 Cmp.getValue(1));
6048}
6049
// NOTE(review): the opening signature of this function (original line 6050)
// is not visible in this dump; by its operands it lowers a conditional
// branch (chain, condition, destination).
6051 SDValue Chain = Op.getOperand(0);
6052 SDValue Cond = Op.getOperand(1);
6053 SDValue Dest = Op.getOperand(2);
6054
  // NOTE(review): original line 6055 (visibly declaring CC, which
  // emitConjunction fills in below) is not visible in this dump.
  // If the condition can be emitted as a flag-setting conjunction, branch on
  // the resulting condition code directly via AArch64ISD::BRCOND.
6056 if (SDValue Cmp = emitConjunction(DAG, Cond, CC)) {
6057 SDLoc dl(Op);
6058 SDValue CCVal = DAG.getConstant(CC, dl, MVT::i32);
6059 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
6060 Cmp);
6061 }
6062
  // Otherwise fall back to default handling.
6063 return SDValue();
6064}
6065
6066// Treat FSHR with constant shifts as legal operation, otherwise it is expanded
6067// FSHL is converted to FSHR before deciding what to do with it
// NOTE(review): the function signature (original line 6068) is not visible
// in this dump.
6069 SDValue Shifts = Op.getOperand(2);
6070 // Check if the shift amount is a constant
6071 // If opcode is FSHL, convert it to FSHR
6072 if (auto *ShiftNo = dyn_cast<ConstantSDNode>(Shifts)) {
6073 SDLoc DL(Op);
6074 MVT VT = Op.getSimpleValueType();
6075
6076 if (Op.getOpcode() == ISD::FSHL) {
      // FSHL(a, b, n) == FSHR(a, b, bitwidth - n) for constant n.
6077 unsigned int NewShiftNo =
6078 VT.getFixedSizeInBits() - ShiftNo->getZExtValue();
6079 return DAG.getNode(
6080 ISD::FSHR, DL, VT, Op.getOperand(0), Op.getOperand(1),
6081 DAG.getConstant(NewShiftNo, DL, Shifts.getValueType()));
6082 } else if (Op.getOpcode() == ISD::FSHR) {
      // FSHR with a constant shift is kept as-is (treated as legal).
6083 return Op;
6084 }
6085 }
6086
  // Non-constant shift amounts fall back to default expansion.
6087 return SDValue();
6088}
6089
// NOTE(review): the function signature (original line 6090) is not visible
// in this dump. By its body this lowers ldexp-style ops (X * 2^Exp) via the
// SVE FSCALE intrinsic on a single-element scalable vector.
6091 SDValue X = Op.getOperand(0);
6092 EVT XScalarTy = X.getValueType();
6093 SDValue Exp = Op.getOperand(1);
6094
6095 SDLoc DL(Op);
6096 EVT XVT, ExpVT;
  // Pick the scalable container types; f16 is widened to f32 first and
  // rounded back at the end.
6097 switch (Op.getSimpleValueType().SimpleTy) {
6098 default:
6099 return SDValue();
6100 case MVT::f16:
6101 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X);
6102 [[fallthrough]];
6103 case MVT::f32:
6104 XVT = MVT::nxv4f32;
6105 ExpVT = MVT::nxv4i32;
6106 break;
6107 case MVT::f64:
6108 XVT = MVT::nxv2f64;
6109 ExpVT = MVT::nxv2i64;
6110 Exp = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Exp);
6111 break;
6112 }
6113
  // Insert the scalar value and exponent into lane 0 of scalable vectors.
6114 SDValue Zero = DAG.getConstant(0, DL, MVT::i64)
6115 SDValue VX =
6116 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, XVT, DAG.getUNDEF(XVT), X, Zero);
6117 SDValue VExp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ExpVT,
6118 DAG.getUNDEF(ExpVT), Exp, Zero);
  // All-true predicate for the FSCALE intrinsic.
6119 SDValue VPg = getPTrue(DAG, DL, XVT.changeVectorElementType(MVT::i1),
6120 AArch64SVEPredPattern::all);
  // NOTE(review): original line 6122 (the start of the node creation that
  // invokes the aarch64_sve_fscale intrinsic) is not visible in this dump.
6121 SDValue FScale =
6123 DAG.getConstant(Intrinsic::aarch64_sve_fscale, DL, MVT::i64),
6124 VPg, VX, VExp);
  // Extract lane 0 as the scalar result, rounding back down if X was widened.
6125 SDValue Final =
6126 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, X.getValueType(), FScale, Zero);
6127 if (X.getValueType() != XScalarTy)
6128 Final = DAG.getNode(ISD::FP_ROUND, DL, XScalarTy, Final,
6129 DAG.getIntPtrConstant(1, SDLoc(Op)));
6130 return Final;
6131}
6132
6134 SelectionDAG &DAG) const {
6135 LLVM_DEBUG(dbgs() << "Custom lowering: ");
6136 LLVM_DEBUG(Op.dump());
6137
6138 switch (Op.getOpcode()) {
6139 default:
6140 llvm_unreachable("unimplemented operand");
6141 return SDValue();
6142 case ISD::BITCAST:
6143 return LowerBITCAST(Op, DAG);
6144 case ISD::GlobalAddress:
6145 return LowerGlobalAddress(Op, DAG);
6147 return LowerGlobalTLSAddress(Op, DAG);
6148 case ISD::SETCC:
6149 case ISD::STRICT_FSETCC:
6151 return LowerSETCC(Op, DAG);
6152 case ISD::SETCCCARRY:
6153 return LowerSETCCCARRY(Op, DAG);
6154 case ISD::BRCOND:
6155 return LowerBRCOND(Op, DAG);
6156 case ISD::BR_CC:
6157 return LowerBR_CC(Op, DAG);
6158 case ISD::SELECT:
6159 return LowerSELECT(Op, DAG);
6160 case ISD::SELECT_CC:
6161 return LowerSELECT_CC(Op, DAG);
6162 case ISD::JumpTable:
6163 return LowerJumpTable(Op, DAG);
6164 case ISD::BR_JT:
6165 return LowerBR_JT(Op, DAG);
6166 case ISD::ConstantPool:
6167 return LowerConstantPool(Op, DAG);
6168 case ISD::BlockAddress:
6169 return LowerBlockAddress(Op, DAG);
6170 case ISD::VASTART:
6171 return LowerVASTART(Op, DAG);
6172 case ISD::VACOPY:
6173 return LowerVACOPY(Op, DAG);
6174 case ISD::VAARG:
6175 return LowerVAARG(Op, DAG);
6176 case ISD::UADDO_CARRY:
6177 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, false /*unsigned*/);
6178 case ISD::USUBO_CARRY:
6179 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, false /*unsigned*/);
6180 case ISD::SADDO_CARRY:
6181 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, true /*signed*/);
6182 case ISD::SSUBO_CARRY:
6183 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, true /*signed*/);
6184 case ISD::SADDO:
6185 case ISD::UADDO:
6186 case ISD::SSUBO:
6187 case ISD::USUBO:
6188 case ISD::SMULO:
6189 case ISD::UMULO:
6190 return LowerXALUO(Op, DAG);
6191 case ISD::FADD:
6192 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED);
6193 case ISD::FSUB:
6194 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSUB_PRED);
6195 case ISD::FMUL:
6196 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMUL_PRED);
6197 case ISD::FMA:
6198 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED);
6199 case ISD::FDIV:
6200 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FDIV_PRED);
6201 case ISD::FNEG:
6202 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU);
6203 case ISD::FCEIL:
6204 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FCEIL_MERGE_PASSTHRU);
6205 case ISD::FFLOOR:
6206 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FFLOOR_MERGE_PASSTHRU);
6207 case ISD::FNEARBYINT:
6208 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEARBYINT_MERGE_PASSTHRU);
6209 case ISD::FRINT:
6210 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FRINT_MERGE_PASSTHRU);
6211 case ISD::FROUND:
6212 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUND_MERGE_PASSTHRU);
6213 case ISD::FROUNDEVEN:
6214 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU);
6215 case ISD::FTRUNC:
6216 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FTRUNC_MERGE_PASSTHRU);
6217 case ISD::FSQRT:
6218 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSQRT_MERGE_PASSTHRU);
6219 case ISD::FABS:
6220 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FABS_MERGE_PASSTHRU);
6221 case ISD::FP_ROUND:
6223 return LowerFP_ROUND(Op, DAG);
6224 case ISD::FP_EXTEND:
6225 return LowerFP_EXTEND(Op, DAG);
6226 case ISD::FRAMEADDR:
6227 return LowerFRAMEADDR(Op, DAG);
6228 case ISD::SPONENTRY:
6229 return LowerSPONENTRY(Op, DAG);
6230 case ISD::RETURNADDR:
6231 return LowerRETURNADDR(Op, DAG);
6233 return LowerADDROFRETURNADDR(Op, DAG);
6235 return LowerCONCAT_VECTORS(Op, DAG);
6237 return LowerINSERT_VECTOR_ELT(Op, DAG);
6239 return LowerEXTRACT_VECTOR_ELT(Op, DAG);
6240 case ISD::BUILD_VECTOR:
6241 return LowerBUILD_VECTOR(Op, DAG);
6243 return LowerZERO_EXTEND_VECTOR_INREG(Op, DAG);
6245 return LowerVECTOR_SHUFFLE(Op, DAG);
6246 case ISD::SPLAT_VECTOR:
6247 return LowerSPLAT_VECTOR(Op, DAG);
6249 return LowerEXTRACT_SUBVECTOR(Op, DAG);
6251 return LowerINSERT_SUBVECTOR(Op, DAG);
6252 case ISD::SDIV:
6253 case ISD::UDIV:
6254 return LowerDIV(Op, DAG);
6255 case ISD::SMIN:
6256 case ISD::UMIN:
6257 case ISD::SMAX:
6258 case ISD::UMAX:
6259 return LowerMinMax(Op, DAG);
6260 case ISD::SRA:
6261 case ISD::SRL:
6262 case ISD::SHL:
6263 return LowerVectorSRA_SRL_SHL(Op, DAG);
6264 case ISD::SHL_PARTS:
6265 case ISD::SRL_PARTS:
6266 case ISD::SRA_PARTS:
6267 return LowerShiftParts(Op, DAG);
6268 case ISD::CTPOP:
6269 case ISD::PARITY:
6270 return LowerCTPOP_PARITY(Op, DAG);
6271 case ISD::FCOPYSIGN:
6272 return LowerFCOPYSIGN(Op, DAG);
6273 case ISD::OR:
6274 return LowerVectorOR(Op, DAG);
6275 case ISD::XOR:
6276 return LowerXOR(Op, DAG);
6277 case ISD::PREFETCH:
6278 return LowerPREFETCH(Op, DAG);
6279 case ISD::SINT_TO_FP:
6280 case ISD::UINT_TO_FP:
6283 return LowerINT_TO_FP(Op, DAG);
6284 case ISD::FP_TO_SINT:
6285 case ISD::FP_TO_UINT:
6288 return LowerFP_TO_INT(Op, DAG);
6291 return LowerFP_TO_INT_SAT(Op, DAG);
6292 case ISD::FSINCOS:
6293 return LowerFSINCOS(Op, DAG);
6294 case ISD::GET_ROUNDING:
6295 return LowerGET_ROUNDING(Op, DAG);
6296 case ISD::SET_ROUNDING:
6297 return LowerSET_ROUNDING(Op, DAG);
6298 case ISD::MUL:
6299 return LowerMUL(Op, DAG);
6300 case ISD::MULHS:
6301 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED);
6302 case ISD::MULHU:
6303 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED);
6305 return LowerINTRINSIC_W_CHAIN(Op, DAG);
6307 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
6309 return LowerINTRINSIC_VOID(Op, DAG);
6310 case ISD::ATOMIC_STORE:
6311 if (cast<MemSDNode>(Op)->getMemoryVT() == MVT::i128) {
6312 assert(Subtarget->hasLSE2() || Subtarget->hasRCPC3());
6313 return LowerStore128(Op, DAG);
6314 }
6315 return SDValue();
6316 case ISD::STORE:
6317 return LowerSTORE(Op, DAG);
6318 case ISD::MSTORE:
6319 return LowerFixedLengthVectorMStoreToSVE(Op, DAG);
6320 case ISD::MGATHER:
6321 return LowerMGATHER(Op, DAG);
6322 case ISD::MSCATTER:
6323 return LowerMSCATTER(Op, DAG);
6325 return LowerVECREDUCE_SEQ_FADD(Op, DAG);
6326 case ISD::VECREDUCE_ADD:
6327 case ISD::VECREDUCE_AND:
6328 case ISD::VECREDUCE_OR:
6329 case ISD::VECREDUCE_XOR:
6339 return LowerVECREDUCE(Op, DAG);
6341 return LowerATOMIC_LOAD_AND(Op, DAG);
6343 return LowerDYNAMIC_STACKALLOC(Op, DAG);
6344 case ISD::VSCALE:
6345 return LowerVSCALE(Op, DAG);
6346 case ISD::ANY_EXTEND:
6347 case ISD::SIGN_EXTEND:
6348 case ISD::ZERO_EXTEND:
6349 return LowerFixedLengthVectorIntExtendToSVE(Op, DAG);
6351 // Only custom lower when ExtraVT has a legal byte based element type.
6352 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
6353 EVT ExtraEltVT = ExtraVT.getVectorElementType();
6354 if ((ExtraEltVT != MVT::i8) && (ExtraEltVT != MVT::i16) &&
6355 (ExtraEltVT != MVT::i32) && (ExtraEltVT != MVT::i64))
6356 return SDValue();
6357
6358 return LowerToPredicatedOp(Op, DAG,
6360 }
6361 case ISD::TRUNCATE:
6362 return LowerTRUNCATE(Op, DAG);
6363 case ISD::MLOAD:
6364 return LowerMLOAD(Op, DAG);
6365 case ISD::LOAD:
6366 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
6367 !Subtarget->isNeonAvailable()))
6368 return LowerFixedLengthVectorLoadToSVE(Op, DAG);
6369 return LowerLOAD(Op, DAG);
6370 case ISD::ADD:
6371 case ISD::AND:
6372 case ISD::SUB:
6373 return LowerToScalableOp(Op, DAG);
6374 case ISD::FMAXIMUM:
6375 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAX_PRED);
6376 case ISD::FMAXNUM:
6377 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAXNM_PRED);
6378 case ISD::FMINIMUM:
6379 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMIN_PRED);
6380 case ISD::FMINNUM:
6381 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMINNM_PRED);
6382 case ISD::VSELECT:
6383 return LowerFixedLengthVectorSelectToSVE(Op, DAG);
6384 case ISD::ABS:
6385 return LowerABS(Op, DAG);
6386 case ISD::ABDS:
6387 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDS_PRED);
6388 case ISD::ABDU:
6389 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDU_PRED);
6390 case ISD::AVGFLOORS:
6391 return LowerAVG(Op, DAG, AArch64ISD::HADDS_PRED);
6392 case ISD::AVGFLOORU:
6393 return LowerAVG(Op, DAG, AArch64ISD::HADDU_PRED);
6394 case ISD::AVGCEILS:
6395 return LowerAVG(Op, DAG, AArch64ISD::RHADDS_PRED);
6396 case ISD::AVGCEILU:
6397 return LowerAVG(Op, DAG, AArch64ISD::RHADDU_PRED);
6398 case ISD::BITREVERSE:
6399 return LowerBitreverse(Op, DAG);
6400 case ISD::BSWAP:
6401 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BSWAP_MERGE_PASSTHRU);
6402 case ISD::CTLZ:
6403 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU);
6404 case ISD::CTTZ:
6405 return LowerCTTZ(Op, DAG);
6406 case ISD::VECTOR_SPLICE:
6407 return LowerVECTOR_SPLICE(Op, DAG);
6409 return LowerVECTOR_DEINTERLEAVE(Op, DAG);
6411 return LowerVECTOR_INTERLEAVE(Op, DAG);
6412 case ISD::LROUND:
6413 case ISD::LLROUND:
6414 case ISD::LRINT:
6415 case ISD::LLRINT: {
6416 assert(Op.getOperand(0).getValueType() == MVT::f16 &&
6417 "Expected custom lowering of rounding operations only for f16");
6418 SDLoc DL(Op);
6419 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Op.getOperand(0));
6420 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), Ext);
6421 }
6422 case ISD::STRICT_LROUND:
6424 case ISD::STRICT_LRINT:
6425 case ISD::STRICT_LLRINT: {
6426 assert(Op.getOperand(1).getValueType() == MVT::f16 &&
6427 "Expected custom lowering of rounding operations only for f16");
6428 SDLoc DL(Op);
6429 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
6430 {Op.getOperand(0), Op.getOperand(1)});
6431 return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
6432 {Ext.getValue(1), Ext.getValue(0)});
6433 }
6434 case ISD::WRITE_REGISTER: {
6435 assert(Op.getOperand(2).getValueType() == MVT::i128 &&
6436 "WRITE_REGISTER custom lowering is only for 128-bit sysregs");
6437 SDLoc DL(Op);
6438
6439 SDValue Chain = Op.getOperand(0);
6440 SDValue SysRegName = Op.getOperand(1);
6441 std::pair<SDValue, SDValue> Pair =
6442 DAG.SplitScalar(Op.getOperand(2), DL, MVT::i64, MVT::i64);
6443
6444 // chain = MSRR(chain, sysregname, lo, hi)
6445 SDValue Result = DAG.getNode(AArch64ISD::MSRR, DL, MVT::Other, Chain,
6446 SysRegName, Pair.first, Pair.second);
6447
6448 return Result;
6449 }
6450 case ISD::FSHL:
6451 case ISD::FSHR:
6452 return LowerFunnelShift(Op, DAG);
6453 case ISD::FLDEXP:
6454 return LowerFLDEXP(Op, DAG);
6455 }
6456}
6457
// NOTE(review): the signature of this member function (original line 6458)
// is not visible in this dump; it simply answers false whenever SVE is being
// used for fixed-length vectors.
6459 return !Subtarget->useSVEForFixedLengthVectors();
6460}
6461
// NOTE(review): the opening line of this member function's signature
// (original line 6462) is not visible in this dump; the visible parameters
// are the vector type being queried and an OverrideNEON flag. Returns true
// when the given fixed-length vector type should be lowered via SVE.
6463 EVT VT, bool OverrideNEON) const {
6464 if (!VT.isFixedLengthVector() || !VT.isSimple())
6465 return false;
6466
6467 // Don't use SVE for vectors we cannot scalarize if required.
6468 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
6469 // Fixed length predicates should be promoted to i8.
6470 // NOTE: This is consistent with how NEON (and thus 64/128bit vectors) work.
6471 case MVT::i1:
6472 default:
6473 return false;
6474 case MVT::i8:
6475 case MVT::i16:
6476 case MVT::i32:
6477 case MVT::i64:
6478 case MVT::f16:
6479 case MVT::f32:
6480 case MVT::f64:
6481 break;
6482 }
6483
6484 // NEON-sized vectors can be emulated using SVE instructions.
6485 if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector()))
6486 return Subtarget->hasSVEorSME();
6487
6488 // Ensure NEON MVTs only belong to a single register class.
6489 if (VT.getFixedSizeInBits() <= 128)
6490 return false;
6491
6492 // Ensure wider than NEON code generation is enabled.
6493 if (!Subtarget->useSVEForFixedLengthVectors())
6494 return false;
6495
6496 // Don't use SVE for types that don't fit.
6497 if (VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits())
6498 return false;
6499
6500 // TODO: Perhaps an artificial restriction, but worth having whilst getting
6501 // the base fixed length SVE support in place.
6502 if (!VT.isPow2VectorType())
6503 return false;
6504
6505 return true;
6506}
6507
6508//===----------------------------------------------------------------------===//
6509// Calling Convention Implementation
6510//===----------------------------------------------------------------------===//
6511
/// Extract the raw intrinsic ID carried by node \p N (operand 0 of an
/// intrinsic node holds the ID as a constant) when it is in the valid range.
// NOTE(review): the rendering elided several hyperlinked lines here — the
// ISD::INTRINSIC_* case label(s) and the fallback return (presumably
// Intrinsic::not_intrinsic); the surviving text below is not compilable
// as-is. Verify against the original source.
static unsigned getIntrinsicID(const SDNode *N) {
  unsigned Opcode = N->getOpcode();
  switch (Opcode) {
  default:
    // Operand 0 holds the intrinsic ID; only accept IDs in range.
    unsigned IID = N->getConstantOperandVal(0);
    if (IID < Intrinsic::num_intrinsics)
      return IID;
  }
  }
}
6525
// Return whether reassociating through \p N0 / \p N1 is profitable; rejects
// reassociation of ADDs feeding widening-multiply patterns so they can still
// fuse into smlal/umlal.
// NOTE(review): the declaration head (by surrounding context, presumably
// AArch64TargetLowering::isReassocProfitable) and one condition line of the
// if (likely the AArch64ISD::SMULL opcode check) were elided by the
// rendering — confirm against the original source.
                                                   SDValue N1) const {
  if (!N0.hasOneUse())
    return false;

  unsigned IID = getIntrinsicID(N1.getNode());
  // Avoid reassociating expressions that can be lowered to smlal/umlal.
  if (IID == Intrinsic::aarch64_neon_umull ||
      N1.getOpcode() == AArch64ISD::UMULL ||
      IID == Intrinsic::aarch64_neon_smull ||
    return N0.getOpcode() != ISD::ADD;

  return true;
}
6541
/// Selects the correct CCAssignFn for a given CallingConvention value.
// NOTE(review): the declaration line (by context,
// AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, ...)) and
// several case labels / return statements (e.g. the Arm64EC and Darwin
// vararg assigners) were elided by the rendering; the surviving text below
// is not compilable as-is. Verify against the original source.
                                                     bool IsVarArg) const {
  switch (CC) {
  default:
    report_fatal_error("Unsupported calling convention.");
  case CallingConv::GHC:
    return CC_AArch64_GHC;
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Swift:
  case CallingConv::Tail:
  case CallingConv::GRAAL:
    if (Subtarget->isTargetWindows()) {
      if (IsVarArg) {
        // NOTE(review): the Arm64EC vararg return was elided here.
        if (Subtarget->isWindowsArm64EC())
      }
      return CC_AArch64_Win64PCS;
    }
    if (!Subtarget->isTargetDarwin())
      return CC_AArch64_AAPCS;
    if (!IsVarArg)
      return CC_AArch64_DarwinPCS;
  case CallingConv::Win64:
    if (IsVarArg) {
      if (Subtarget->isWindowsArm64EC())
    }
    return CC_AArch64_Win64PCS;
    if (Subtarget->isWindowsArm64EC())
    return CC_AArch64_AAPCS;
  }
}
6595
CCAssignFn *
// Selects the CCAssignFn used for return values of the given convention.
// NOTE(review): the declaration tail (presumably
// AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const) and
// the non-default case label(s) were elided by the rendering — confirm
// against the original source.
  switch (CC) {
  default:
    return RetCC_AArch64_AAPCS;
    if (Subtarget->isWindowsArm64EC())
    return RetCC_AArch64_AAPCS;
  }
}
6609
6610
// Allocate the SME ZA lazy-save machinery for the current function: a
// dynamically-sized save buffer of RDSVL x RDSVL bytes (worst case) plus a
// 16-byte TPIDR2 stack object that records the buffer pointer and has its
// reserved bytes (10-15) zeroed. Returns the frame index of the TPIDR2
// object; \p Chain is updated with the emitted stores.
// NOTE(review): a few lines were elided by the rendering (the local
// 'MachineFunction &MF' declaration and the creation of 'Ptr'/'MPI' for the
// TPIDR2 object) — confirm against the original source.
unsigned
AArch64TargetLowering::allocateLazySaveBuffer(SDValue &Chain, const SDLoc &DL,
                                              SelectionDAG &DAG) const {
  MachineFrameInfo &MFI = MF.getFrameInfo();

  // Allocate a lazy-save buffer object of size SVL.B * SVL.B (worst-case)
  SDValue N = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
                          DAG.getConstant(1, DL, MVT::i32));
  SDValue NN = DAG.getNode(ISD::MUL, DL, MVT::i64, N, N);
  SDValue Ops[] = {Chain, NN, DAG.getConstant(1, DL, MVT::i64)};
  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Other);
  SDValue Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL, VTs, Ops);
  Chain = Buffer.getValue(1);
  MFI.CreateVariableSizedObject(Align(1), nullptr);

  // Allocate an additional TPIDR2 object on the stack (16 bytes)
  unsigned TPIDR2Obj = MFI.CreateStackObject(16, Align(16), false);

  // Store the buffer pointer to the TPIDR2 stack object.
      TPIDR2Obj,
  Chain = DAG.getStore(Chain, DL, Buffer, Ptr, MPI);

  // Set the reserved bytes (10-15) to zero
  // (an i16 store at offset 10 covers bytes 10-11, an i32 store at offset 12
  // covers bytes 12-15).
  EVT PtrTy = Ptr.getValueType();
  SDValue ReservedPtr =
      DAG.getNode(ISD::ADD, DL, PtrTy, Ptr, DAG.getConstant(10, DL, PtrTy));
  Chain = DAG.getStore(Chain, DL, DAG.getConstant(0, DL, MVT::i16), ReservedPtr,
                       MPI);
  ReservedPtr =
      DAG.getNode(ISD::ADD, DL, PtrTy, Ptr, DAG.getConstant(12, DL, PtrTy));
  Chain = DAG.getStore(Chain, DL, DAG.getConstant(0, DL, MVT::i32), ReservedPtr,
                       MPI);

  return TPIDR2Obj;
}
6650
/// Lower the incoming (formal) arguments of the current function: assign each
/// argument to a register or stack slot per the selected calling convention,
/// emit the CopyFromReg / load nodes into \p InVals, and handle the special
/// cases visible below — byval HFAs, SME locally-streaming entry (SMSTART),
/// indirectly-passed (scalable / Arm64EC) arguments, varargs register save
/// areas, Win64 sret registers and the SME ZA lazy-save buffer.
// NOTE(review): this listing came from a rendering that dropped hyperlinked
// lines — e.g. local declarations ('MachineFunction &MF', 'FuncInfo',
// 'SmallVector ... Outs/ArgLocs', 'SMEAttrs Attrs', 'SDNodeFlags Flags',
// 'ISD::LoadExtType ExtType') and some case labels. The surviving lines are
// preserved verbatim below; consult the original source before editing.
SDValue AArch64TargetLowering::LowerFormalArguments(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  const Function &F = MF.getFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  bool IsWin64 = Subtarget->isCallingConvWin64(F.getCallingConv());
  // Arm64EC varargs and x64-entry thunks address their stack arguments
  // relative to x4 rather than sp (see the StackViaX4 uses below).
  bool StackViaX4 = CallConv == CallingConv::ARM64EC_Thunk_X64 ||
                    (isVarArg && Subtarget->isWindowsArm64EC());

  GetReturnInfo(CallConv, F.getReturnType(), F.getAttributes(), Outs,
  if (any_of(Outs, [](ISD::OutputArg &Out){ return Out.VT.isScalableVector(); }))
    FuncInfo->setIsSVECC(true);

  // Assign locations to all of the incoming arguments.
  DenseMap<unsigned, SDValue> CopiedRegs;
  CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());

  // At this point, Ins[].VT may already be promoted to i32. To correctly
  // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
  // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
  // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
  // we use a special version of AnalyzeFormalArguments to pass in ValVT and
  // LocVT.
  unsigned NumArgs = Ins.size();
  Function::const_arg_iterator CurOrigArg = F.arg_begin();
  unsigned CurArgIdx = 0;
  for (unsigned i = 0; i != NumArgs; ++i) {
    MVT ValVT = Ins[i].VT;
    if (Ins[i].isOrigArg()) {
      std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
      CurArgIdx = Ins[i].getOrigArgIndex();

      // Get type of the original argument.
      EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
                                  /*AllowUnknown*/ true);
      MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
      // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
      if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
        ValVT = MVT::i8;
      else if (ActualMVT == MVT::i16)
        ValVT = MVT::i16;
    }
    bool UseVarArgCC = false;
    if (IsWin64)
      UseVarArgCC = isVarArg;
    CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, UseVarArgCC);
    bool Res =
        AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
    assert(!Res && "Call operand has unhandled type");
    (void)Res;
  }

  // A locally-streaming function has a non-streaming interface but a
  // streaming body, so an SMSTART must be inserted at entry (below).
  bool IsLocallyStreaming =
      !Attrs.hasStreamingInterface() && Attrs.hasStreamingBody();
  assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
  SDValue Glue = Chain.getValue(1);

  SmallVector<SDValue, 16> ArgValues;
  unsigned ExtraArgLocs = 0;
  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
    CCValAssign &VA = ArgLocs[i - ExtraArgLocs];

    if (Ins[i].Flags.isByVal()) {
      // Byval is used for HFAs in the PCS, but the system should work in a
      // non-compliant manner for larger structs.
      EVT PtrVT = getPointerTy(DAG.getDataLayout());
      int Size = Ins[i].Flags.getByValSize();
      unsigned NumRegs = (Size + 7) / 8;

      // FIXME: This works on big-endian for composite byvals, which are the common
      // case. It should also work for fundamental types too.
      unsigned FrameIdx =
          MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
      SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
      InVals.push_back(FrameIdxN);

      continue;
    }

    if (Ins[i].Flags.isSwiftAsync())

    SDValue ArgValue;
    if (VA.isRegLoc()) {
      // Arguments stored in registers.
      EVT RegVT = VA.getLocVT();
      const TargetRegisterClass *RC;

      if (RegVT == MVT::i32)
        RC = &AArch64::GPR32RegClass;
      else if (RegVT == MVT::i64)
        RC = &AArch64::GPR64RegClass;
      else if (RegVT == MVT::f16 || RegVT == MVT::bf16)
        RC = &AArch64::FPR16RegClass;
      else if (RegVT == MVT::f32)
        RC = &AArch64::FPR32RegClass;
      else if (RegVT == MVT::f64 || RegVT.is64BitVector())
        RC = &AArch64::FPR64RegClass;
      else if (RegVT == MVT::f128 || RegVT.is128BitVector())
        RC = &AArch64::FPR128RegClass;
      else if (RegVT.isScalableVector() &&
               RegVT.getVectorElementType() == MVT::i1) {
        FuncInfo->setIsSVECC(true);
        RC = &AArch64::PPRRegClass;
      } else if (RegVT == MVT::aarch64svcount) {
        FuncInfo->setIsSVECC(true);
        RC = &AArch64::PPRRegClass;
      } else if (RegVT.isScalableVector()) {
        FuncInfo->setIsSVECC(true);
        RC = &AArch64::ZPRRegClass;
      } else
        llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");

      // Transform the arguments in physical registers into virtual ones.
      Register Reg = MF.addLiveIn(VA.getLocReg(), RC);

      if (IsLocallyStreaming) {
        // LocallyStreamingFunctions must insert the SMSTART in the correct
        // position, so we use Glue to ensure no instructions can be scheduled
        // between the chain of:
        //        t0: ch,glue = EntryNode
        //      t1:  res,ch,glue = CopyFromReg
        //     ...
        //   tn: res,ch,glue = CopyFromReg t(n-1), ..
        // t(n+1): ch, glue = SMSTART t0:0, ...., tn:2
        // ^^^^^^
        // This will be the new Chain/Root node.
        ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT, Glue);
        Glue = ArgValue.getValue(2);
      } else
        ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);

      // If this is an 8, 16 or 32-bit value, it is really passed promoted
      // to 64 bits. Insert an assert[sz]ext to capture this, then
      // truncate to the right size.
      switch (VA.getLocInfo()) {
      default:
        llvm_unreachable("Unknown loc info!");
      case CCValAssign::Full:
        break;
        assert(
            (VA.getValVT().isScalableVT() || Subtarget->isWindowsArm64EC()) &&
            "Indirect arguments should be scalable on most subtargets");
        break;
      case CCValAssign::BCvt:
        ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
        break;
      case CCValAssign::AExt:
      case CCValAssign::SExt:
      case CCValAssign::ZExt:
        break;
        ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue,
                               DAG.getConstant(32, DL, RegVT));
        ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT());
        break;
      }
    } else { // VA.isRegLoc()
      assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
      unsigned ArgOffset = VA.getLocMemOffset();
      unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect
                              ? VA.getLocVT().getSizeInBits()
                              : VA.getValVT().getSizeInBits()) / 8;

      uint32_t BEAlign = 0;
      if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
          !Ins[i].Flags.isInConsecutiveRegs())
        BEAlign = 8 - ArgSize;

      SDValue FIN;
      MachinePointerInfo PtrInfo;
      if (StackViaX4) {
        // In both the ARM64EC varargs convention and the thunk convention,
        // arguments on the stack are accessed relative to x4, not sp. In
        // the thunk convention, there's an additional offset of 32 bytes
        // to account for the shadow store.
        unsigned ObjOffset = ArgOffset + BEAlign;
        if (CallConv == CallingConv::ARM64EC_Thunk_X64)
          ObjOffset += 32;
        Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
        SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
        FIN = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
                          DAG.getConstant(ObjOffset, DL, MVT::i64));
      } else {
        int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);

        // Create load nodes to retrieve arguments from the stack.
        FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
        PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
      }

      // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
      MVT MemVT = VA.getValVT();

      switch (VA.getLocInfo()) {
      default:
        break;
      case CCValAssign::Trunc:
      case CCValAssign::BCvt:
        MemVT = VA.getLocVT();
        break;
                Subtarget->isWindowsArm64EC()) &&
               "Indirect arguments should be scalable on most subtargets");
        MemVT = VA.getLocVT();
        break;
      case CCValAssign::SExt:
        ExtType = ISD::SEXTLOAD;
        break;
      case CCValAssign::ZExt:
        ExtType = ISD::ZEXTLOAD;
        break;
      case CCValAssign::AExt:
        ExtType = ISD::EXTLOAD;
        break;
      }

      ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN, PtrInfo,
                                MemVT);
    }

    if (VA.getLocInfo() == CCValAssign::Indirect) {
      assert((VA.getValVT().isScalableVT() ||
              Subtarget->isWindowsArm64EC()) &&
             "Indirect arguments should be scalable on most subtargets");

      uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinValue();
      unsigned NumParts = 1;
      if (Ins[i].Flags.isInConsecutiveRegs()) {
        assert(!Ins[i].Flags.isInConsecutiveRegsLast());
        while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
          ++NumParts;
      }

      MVT PartLoad = VA.getValVT();
      SDValue Ptr = ArgValue;

      // Ensure we generate all loads for each tuple part, whilst updating the
      // pointer after each load correctly using vscale.
      while (NumParts > 0) {
        ArgValue = DAG.getLoad(PartLoad, DL, Chain, Ptr, MachinePointerInfo());
        InVals.push_back(ArgValue);
        NumParts--;
        if (NumParts > 0) {
          SDValue BytesIncrement;
          if (PartLoad.isScalableVector()) {
            BytesIncrement = DAG.getVScale(
                DL, Ptr.getValueType(),
                APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
          } else {
            BytesIncrement = DAG.getConstant(
                APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
                Ptr.getValueType());
          }
          Flags.setNoUnsignedWrap(true);
          Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
                            BytesIncrement, Flags);
          ExtraArgLocs++;
          i++;
        }
      }
    } else {
      if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
        ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
                               ArgValue, DAG.getValueType(MVT::i32));

      // i1 arguments are zero-extended to i8 by the caller. Emit a
      // hint to reflect this.
      if (Ins[i].isOrigArg()) {
        Argument *OrigArg = F.getArg(Ins[i].getOrigArgIndex());
        if (OrigArg->getType()->isIntegerTy(1)) {
          if (!Ins[i].Flags.isZExt()) {
            ArgValue = DAG.getNode(AArch64ISD::ASSERT_ZEXT_BOOL, DL,
                                   ArgValue.getValueType(), ArgValue);
          }
        }
      }

      InVals.push_back(ArgValue);
    }
  }
  assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());

  // Insert the SMSTART if this is a locally streaming function and
  // make sure it is Glued to the last CopyFromReg value.
  if (IsLocallyStreaming) {
    SDValue PStateSM;
    if (Attrs.hasStreamingCompatibleInterface()) {
      PStateSM = getRuntimePStateSM(DAG, Chain, DL, MVT::i64);
      FuncInfo->setPStateSMReg(Reg);
      Chain = DAG.getCopyToReg(Chain, DL, Reg, PStateSM);
    } else {
      PStateSM = DAG.getConstant(0, DL, MVT::i64);
    }
    Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue, PStateSM,
                                /*Entry*/ true);

    // Ensure that the SMSTART happens after the CopyWithChain such that its
    // chain result is used.
    for (unsigned I=0; I<InVals.size(); ++I) {
          getRegClassFor(InVals[I].getValueType().getSimpleVT()));
      Chain = DAG.getCopyToReg(Chain, DL, Reg, InVals[I]);
      InVals[I] = DAG.getCopyFromReg(Chain, DL, Reg,
                                     InVals[I].getValueType());
    }
  }

  // varargs
  if (isVarArg) {
    if (!Subtarget->isTargetDarwin() || IsWin64) {
      // The AAPCS variadic function ABI is identical to the non-variadic
      // one. As a result there may be more arguments in registers and we should
      // save them for future reference.
      // Win64 variadic functions also pass arguments in registers, but all float
      // arguments are passed in integer registers.
      saveVarArgRegisters(CCInfo, DAG, DL, Chain);
    }

    // This will point to the next argument passed via stack.
    unsigned VarArgsOffset = CCInfo.getStackSize();
    // We currently pass all varargs at 8-byte alignment, or 4 for ILP32
    VarArgsOffset = alignTo(VarArgsOffset, Subtarget->isTargetILP32() ? 4 : 8);
    FuncInfo->setVarArgsStackOffset(VarArgsOffset);
    FuncInfo->setVarArgsStackIndex(
        MFI.CreateFixedObject(4, VarArgsOffset, true));

    if (MFI.hasMustTailInVarArgFunc()) {
      SmallVector<MVT, 2> RegParmTypes;
      RegParmTypes.push_back(MVT::i64);
      RegParmTypes.push_back(MVT::f128);
      // Compute the set of forwarded registers. The rest are scratch.
          FuncInfo->getForwardedMustTailRegParms();
      CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,

      // Conservatively forward X8, since it might be used for aggregate return.
      if (!CCInfo.isAllocated(AArch64::X8)) {
        Register X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
        Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
      }
    }
  }

  // On Windows, InReg pointers must be returned, so record the pointer in a
  // virtual register at the start of the function so it can be returned in the
  // epilogue.
  if (IsWin64 || F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64) {
    for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
      if ((F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64 ||
           Ins[I].Flags.isInReg()) &&
          Ins[I].Flags.isSRet()) {
        assert(!FuncInfo->getSRetReturnReg());

        MVT PtrTy = getPointerTy(DAG.getDataLayout());
        Register Reg =
        FuncInfo->setSRetReturnReg(Reg);

        SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]);
        Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
        break;
      }
    }
  }

  unsigned StackArgSize = CCInfo.getStackSize();
  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
  if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
    // This is a non-standard ABI so by fiat I say we're allowed to make full
    // use of the stack area to be popped, which must be aligned to 16 bytes in
    // any case:
    StackArgSize = alignTo(StackArgSize, 16);

    // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
    // a multiple of 16.
    FuncInfo->setArgumentStackToRestore(StackArgSize);

    // This realignment carries over to the available bytes below. Our own
    // callers will guarantee the space is free by giving an aligned value to
    // CALLSEQ_START.
  }
  // Even if we're not expected to free up the space, it's useful to know how
  // much is there while considering tail calls (because we can reuse it).
  FuncInfo->setBytesInStackArgArea(StackArgSize);

  if (Subtarget->hasCustomCallingConv())

  // Conservatively assume the function requires the lazy-save mechanism.
  if (SMEAttrs(MF.getFunction()).hasZAState()) {
    unsigned TPIDR2Obj = allocateLazySaveBuffer(Chain, DL, DAG);
    FuncInfo->setLazySaveTPIDR2Obj(TPIDR2Obj);
  }

  return Chain;
}
7063
/// Spill the unallocated GPR (and, off-Win64, FPR) argument registers of a
/// variadic function into stack save areas so va_arg can walk them; records
/// the save-area frame indices and sizes in the function info and token-
/// factors all stores into \p Chain.
// NOTE(review): some local declarations ('MachineFunction &MF', 'FuncInfo',
// 'SmallVector ... MemOps', and the GPR/FPR arg-reg arrays) were elided by
// the rendering; the surviving lines are preserved verbatim.
void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
                                                SelectionDAG &DAG,
                                                const SDLoc &DL,
                                                SDValue &Chain) const {
  MachineFrameInfo &MFI = MF.getFrameInfo();
  auto PtrVT = getPointerTy(DAG.getDataLayout());
  bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());

  unsigned NumGPRArgRegs = GPRArgRegs.size();
  if (Subtarget->isWindowsArm64EC()) {
    // In the ARM64EC ABI, only x0-x3 are used to pass arguments to varargs
    // functions.
    NumGPRArgRegs = 4;
  }
  unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);

  unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
  int GPRIdx = 0;
  if (GPRSaveSize != 0) {
    if (IsWin64) {
      GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
      if (GPRSaveSize & 15)
        // The extra size here, if triggered, will always be 8.
        MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
    } else
      GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false);

    SDValue FIN;
    if (Subtarget->isWindowsArm64EC()) {
      // With the Arm64EC ABI, we reserve the save area as usual, but we
      // compute its address relative to x4.  For a normal AArch64->AArch64
      // call, x4 == sp on entry, but calls from an entry thunk can pass in a
      // different address.
      Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
      SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
      FIN = DAG.getNode(ISD::SUB, DL, MVT::i64, Val,
                        DAG.getConstant(GPRSaveSize, DL, MVT::i64));
    } else {
      FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
    }

    for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
      Register VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
      SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
      SDValue Store =
          DAG.getStore(Val.getValue(1), DL, Val, FIN,
                           MF, GPRIdx, (i - FirstVariadicGPR) * 8)
                     : MachinePointerInfo::getStack(MF, i * 8));
      MemOps.push_back(Store);
      FIN =
          DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
    }
  }
  FuncInfo->setVarArgsGPRIndex(GPRIdx);
  FuncInfo->setVarArgsGPRSize(GPRSaveSize);

  // FPR argument registers are only saved for non-Win64 targets; on Win64
  // varargs floats travel in integer registers (see caller's comment).
  if (Subtarget->hasFPARMv8() && !IsWin64) {
    const unsigned NumFPRArgRegs = FPRArgRegs.size();
    unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);

    unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
    int FPRIdx = 0;
    if (FPRSaveSize != 0) {
      FPRIdx = MFI.CreateStackObject(FPRSaveSize, Align(16), false);

      SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);

      for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
        Register VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
        SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);

        SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
                                     MachinePointerInfo::getStack(MF, i * 16));
        MemOps.push_back(Store);
        FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
                          DAG.getConstant(16, DL, PtrVT));
      }
    }
    FuncInfo->setVarArgsFPRIndex(FPRIdx);
    FuncInfo->setVarArgsFPRSize(FPRSaveSize);
  }

  if (!MemOps.empty()) {
    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
  }
}
7157
7158static bool isPassedInFPR(EVT VT) {
7159 return VT.isFixedLengthVector() ||
7160 (VT.isFloatingPoint() && !VT.isScalableVector());
7161}
7162
/// LowerCallResult - Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
// NOTE(review): one case label (between the BCvt case and the SRL-by-32
// sequence) and the node creation under the RequiresSMChange check were
// elided by the rendering; the surviving lines are preserved verbatim.
SDValue AArch64TargetLowering::LowerCallResult(
    SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<CCValAssign> &RVLocs, const SDLoc &DL,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
    SDValue ThisVal, bool RequiresSMChange) const {
  DenseMap<unsigned, SDValue> CopiedRegs;
  // Copy all of the result registers out of their specified physreg.
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign VA = RVLocs[i];

    // Pass 'this' value directly from the argument to return value, to avoid
    // reg unit interference
    if (i == 0 && isThisReturn) {
      assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
             "unexpected return calling convention register assignment");
      InVals.push_back(ThisVal);
      continue;
    }

    // Avoid copying a physreg twice since RegAllocFast is incompetent and only
    // allows one use of a physreg per block.
    SDValue Val = CopiedRegs.lookup(VA.getLocReg());
    if (!Val) {
      Val =
          DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
      Chain = Val.getValue(1);
      InGlue = Val.getValue(2);
      CopiedRegs[VA.getLocReg()] = Val;
    }

    switch (VA.getLocInfo()) {
    default:
      llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full:
      break;
    case CCValAssign::BCvt:
      Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
      break;
      // Value lives in the upper 32 bits of the location register: shift it
      // down, then zero-extend/truncate to the value type.
      Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val,
                        DAG.getConstant(32, DL, VA.getLocVT()));
      [[fallthrough]];
    case CCValAssign::AExt:
      [[fallthrough]];
    case CCValAssign::ZExt:
      Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT());
      break;
    }

    if (RequiresSMChange && isPassedInFPR(VA.getValVT()))
                        Val);

    InVals.push_back(Val);
  }

  return Chain;
}
7223
/// Return true if the calling convention is one that we can guarantee TCO for.
// NOTE(review): the second half of the return expression (the remaining
// always-TCO conventions, presumably including CallingConv::Tail) was elided
// by the rendering — confirm against the original source.
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
  return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
}
7229
/// Return true if we might ever do TCO for calls with this calling convention.
// NOTE(review): the function declaration line (presumably
// 'static bool mayTailCallThisCC(CallingConv::ID CC) {') and several case
// labels between CallingConv::C and CallingConv::Swift were elided by the
// rendering — confirm against the original source.
  switch (CC) {
  case CallingConv::C:
  case CallingConv::Swift:
  case CallingConv::Tail:
  case CallingConv::Fast:
    return true;
  default:
    return false;
  }
}
7246
// Run the calling-convention assignment functions over all outgoing call
// operands described by CLI, populating CCInfo; mirrors the promotion rules
// used by LowerFormalArguments (i1/i8 -> i8, i16 -> i16) and forces vararg
// handling for Win64 callees and non-fixed arguments.
// NOTE(review): the declaration head (presumably
// 'static void analyzeCallOperands(const AArch64TargetLowering &TLI,' plus
// the CLI parameter line) was elided by the rendering; only the tail below
// survived.
                                const AArch64Subtarget *Subtarget,
                                CCState &CCInfo) {
  const SelectionDAG &DAG = CLI.DAG;
  CallingConv::ID CalleeCC = CLI.CallConv;
  bool IsVarArg = CLI.IsVarArg;
  const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
  bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);

  // For Arm64EC thunks, allocate 32 extra bytes at the bottom of the stack
  // for the shadow store.
  if (CalleeCC == CallingConv::ARM64EC_Thunk_X64)
    CCInfo.AllocateStack(32, Align(16));

  unsigned NumArgs = Outs.size();
  for (unsigned i = 0; i != NumArgs; ++i) {
    MVT ArgVT = Outs[i].VT;
    ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;

    bool UseVarArgCC = false;
    if (IsVarArg) {
      // On Windows, the fixed arguments in a vararg call are passed in GPRs
      // too, so use the vararg CC to force them to integer registers.
      if (IsCalleeWin64) {
        UseVarArgCC = true;
      } else {
        UseVarArgCC = !Outs[i].IsFixed;
      }
    }

    if (!UseVarArgCC) {
      // Get type of the original argument.
      EVT ActualVT =
          TLI.getValueType(DAG.getDataLayout(), CLI.Args[Outs[i].OrigArgIndex].Ty,
                           /*AllowUnknown*/ true);
      MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ArgVT;
      // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
      if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
        ArgVT = MVT::i8;
      else if (ActualMVT == MVT::i16)
        ArgVT = MVT::i16;
    }

    CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CalleeCC, UseVarArgCC);
    bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
    assert(!Res && "Call operand has unhandled type");
    (void)Res;
  }
}
7297
/// Determine whether the call described by \p CLI may be lowered as a tail
/// call without changing observable behaviour. Rejects: conventions we never
/// tail-call; SME streaming/ZA state changes; Win64-on-non-Windows callers;
/// byval or "inreg" caller arguments; weak external callees on ELF/MachO;
/// result/CSR-mask mismatches between caller and callee conventions;
/// indirectly-passed arguments; and stack arguments that do not fit in the
/// caller's own argument area.
// NOTE(review): a few local declarations ('MachineFunction &MF', the target
// Triple, 'SmallVector ... ArgLocs', and the SVE-CC caller adjustment after
// the C/Fast check) were elided by the rendering; surviving lines are
// preserved verbatim.
bool AArch64TargetLowering::isEligibleForTailCallOptimization(
    const CallLoweringInfo &CLI) const {
  CallingConv::ID CalleeCC = CLI.CallConv;
  if (!mayTailCallThisCC(CalleeCC))
    return false;

  SDValue Callee = CLI.Callee;
  bool IsVarArg = CLI.IsVarArg;
  const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
  const SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
  const SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
  const SelectionDAG &DAG = CLI.DAG;
  const Function &CallerF = MF.getFunction();
  CallingConv::ID CallerCC = CallerF.getCallingConv();

  // SME Streaming functions are not eligible for TCO as they may require
  // the streaming mode or ZA to be restored after returning from the call.
  SMEAttrs CallerAttrs(MF.getFunction());
  auto CalleeAttrs = CLI.CB ? SMEAttrs(*CLI.CB) : SMEAttrs(SMEAttrs::Normal);
  if (CallerAttrs.requiresSMChange(CalleeAttrs) ||
      CallerAttrs.requiresLazySave(CalleeAttrs) ||
      CallerAttrs.hasStreamingBody())
    return false;

  // Functions using the C or Fast calling convention that have an SVE signature
  // preserve more registers and should assume the SVE_VectorCall CC.
  // The check for matching callee-saved regs will determine whether it is
  // eligible for TCO.
  if ((CallerCC == CallingConv::C || CallerCC == CallingConv::Fast) &&

  bool CCMatch = CallerCC == CalleeCC;

  // When using the Windows calling convention on a non-windows OS, we want
  // to back up and restore X18 in such functions; we can't do a tail call
  // from those functions.
  if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() &&
      CalleeCC != CallingConv::Win64)
    return false;

  // Byval parameters hand the function a pointer directly into the stack area
  // we want to reuse during a tail call. Working around this *is* possible (see
  // X86) but less efficient and uglier in LowerCall.
  for (Function::const_arg_iterator i = CallerF.arg_begin(),
                                    e = CallerF.arg_end();
       i != e; ++i) {
    if (i->hasByValAttr())
      return false;

    // On Windows, "inreg" attributes signify non-aggregate indirect returns.
    // In this case, it is necessary to save/restore X0 in the callee. Tail
    // call opt interferes with this. So we disable tail call opt when the
    // caller has an argument with "inreg" attribute.

    // FIXME: Check whether the callee also has an "inreg" argument.
    if (i->hasInRegAttr())
      return false;
  }

  if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt))
    return CCMatch;

  // Externally-defined functions with weak linkage should not be
  // tail-called on AArch64 when the OS does not support dynamic
  // pre-emption of symbols, as the AAELF spec requires normal calls
  // to undefined weak functions to be replaced with a NOP or jump to the
  // next instruction. The behaviour of branch instructions in this
  // situation (as used for tail calls) is implementation-defined, so we
  // cannot rely on the linker replacing the tail call with a return.
  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
    const GlobalValue *GV = G->getGlobal();
    if (GV->hasExternalWeakLinkage() &&
        (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
      return false;
  }

  // Now we search for cases where we can use a tail call without changing the
  // ABI. Sibcall is used in some places (particularly gcc) to refer to this
  // concept.

  // I want anyone implementing a new calling convention to think long and hard
  // about this assert.
  assert((!IsVarArg || CalleeCC == CallingConv::C) &&
         "Unexpected variadic calling convention");

  LLVMContext &C = *DAG.getContext();
  // Check that the call results are passed in the same way.
  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
                                  CCAssignFnForCall(CalleeCC, IsVarArg),
                                  CCAssignFnForCall(CallerCC, IsVarArg)))
    return false;
  // The callee has to preserve all registers the caller needs to preserve.
  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
  if (!CCMatch) {
    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (Subtarget->hasCustomCallingConv()) {
      TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
      TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
    }
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
      return false;
  }

  // Nothing more to check if the callee is taking no arguments
  if (Outs.empty())
    return true;

  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, C);

  analyzeCallOperands(*this, Subtarget, CLI, CCInfo);

  if (IsVarArg && !(CLI.CB && CLI.CB->isMustTailCall())) {
    // When we are musttail, additional checks have been done and we can safely ignore this check
    // At least two cases here: if caller is fastcc then we can't have any
    // memory arguments (we'd be expected to clean up the stack afterwards). If
    // caller is C then we could potentially use its argument area.

    // FIXME: for now we take the most conservative of these in both cases:
    // disallow all variadic memory operands.
    for (const CCValAssign &ArgLoc : ArgLocs)
      if (!ArgLoc.isRegLoc())
        return false;
  }

  const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();

  // If any of the arguments is passed indirectly, it must be SVE, so the
  // 'getBytesInStackArgArea' is not sufficient to determine whether we need to
  // allocate space on the stack. That is why we determine this explicitly here
  // the call cannot be a tailcall.
  if (llvm::any_of(ArgLocs, [&](CCValAssign &A) {
        assert((A.getLocInfo() != CCValAssign::Indirect ||
                A.getValVT().isScalableVector() ||
                Subtarget->isWindowsArm64EC()) &&
               "Expected value to be scalable");
        return A.getLocInfo() == CCValAssign::Indirect;
      }))
    return false;

  // If the stack arguments for this call do not fit into our own save area then
  // the call cannot be made tail.
  if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
    return false;

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
    return false;

  return true;
}
7453
7454SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
7455 SelectionDAG &DAG,
7456 MachineFrameInfo &MFI,
7457 int ClobberedFI) const {
7458 SmallVector<SDValue, 8> ArgChains;
7459 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
7460 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
7461
7462 // Include the original chain at the beginning of the list. When this is
7463 // used by target LowerCall hooks, this helps legalize find the
7464 // CALLSEQ_BEGIN node.
7465 ArgChains.push_back(Chain);
7466
7467 // Add a chain value for each stack argument corresponding
7468 for (SDNode *U : DAG.getEntryNode().getNode()->uses())
7469 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U))
7470 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
7471 if (FI->getIndex() < 0) {
7472 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
7473 int64_t InLastByte = InFirstByte;
7474 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
7475
7476 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
7477 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
7478 ArgChains.push_back(SDValue(L, 1));
7479 }
7480
7481 // Build a tokenfactor for all the chains.
7482 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
7483}
7484
7485bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
7486 bool TailCallOpt) const {
7487 return (CallCC == CallingConv::Fast && TailCallOpt) ||
7488 CallCC == CallingConv::Tail || CallCC == CallingConv::SwiftTail;
7489}
7490
7491// Check if the value is zero-extended from i1 to i8
7492static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
7493 unsigned SizeInBits = Arg.getValueType().getSizeInBits();
7494 if (SizeInBits < 8)
7495 return false;
7496
7497 APInt RequredZero(SizeInBits, 0xFE);
7498 KnownBits Bits = DAG.computeKnownBits(Arg, 4);
7499 bool ZExtBool = (Bits.Zero & RequredZero) == RequredZero;
7500 return ZExtBool;
7501}
7502
7503void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
7504 SDNode *Node) const {
7505 // Live-in physreg copies that are glued to SMSTART are applied as
7506 // implicit-def's in the InstrEmitter. Here we remove them, allowing the
7507 // register allocator to pass call args in callee saved regs, without extra
7508 // copies to avoid these fake clobbers of actually-preserved GPRs.
7509 if (MI.getOpcode() == AArch64::MSRpstatesvcrImm1 ||
7510 MI.getOpcode() == AArch64::MSRpstatePseudo)
7511 for (unsigned I = MI.getNumOperands() - 1; I > 0; --I)
7512 if (MachineOperand &MO = MI.getOperand(I);
7513 MO.isReg() && MO.isImplicit() && MO.isDef() &&
7514 (AArch64::GPR32RegClass.contains(MO.getReg()) ||
7515 AArch64::GPR64RegClass.contains(MO.getReg())))
7516 MI.removeOperand(I);
7517}
7518
// Build an AArch64ISD::SMSTART or SMSTOP node that switches PSTATE.SM around
// a call. 'Enable' selects SMSTART vs SMSTOP. 'Entry' distinguishes the
// switch made before the call (true) from the switch back afterwards (false);
// it flips the expected-PSTATE.SM operand carried on the node.
// NOTE(review): the opening signature line and some local declarations are
// missing from this extraction; the code below is kept verbatim.
    SelectionDAG &DAG, SDLoc DL, bool Enable,
    SDValue Chain, SDValue InGlue, SDValue PStateSM, bool Entry) const {
  // Record on the function info that a streaming-mode change occurs here.
  FuncInfo->setHasStreamingModeChanges(true);

  // SMSTART/SMSTOP clobber a different set of registers than a normal call;
  // attach the dedicated preserved-register mask.
  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
  SDValue RegMask = DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask());
  SDValue MSROp =
      DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32);

  // ExpectedSMVal: Enable when entering the call, !Enable when returning.
  SDValue ExpectedSMVal =
      DAG.getTargetConstant(Entry ? Enable : !Enable, DL, MVT::i64);
  SmallVector<SDValue> Ops = {Chain, MSROp, PStateSM, ExpectedSMVal, RegMask};

  if (InGlue)
    Ops.push_back(InGlue);

  unsigned Opcode = Enable ? AArch64ISD::SMSTART : AArch64ISD::SMSTOP;
  return DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
}
7541
/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
/// and add input and output parameter nodes.
///
/// Also handles the AArch64-specific concerns visible below: tail-call
/// eligibility and sibling calls, SME streaming-mode transitions
/// (SMSTART/SMSTOP), ZA lazy-save and ZT0 spill/restore, indirect (SVE)
/// arguments, Arm64EC vararg registers x4/x5, BTI-guarded calls and
/// KCFI-annotated indirect calls.
SDValue
AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
                                 SmallVectorImpl<SDValue> &InVals) const {
  SelectionDAG &DAG = CLI.DAG;
  SDLoc &DL = CLI.DL;
  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
  SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
  SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
  SDValue Chain = CLI.Chain;
  SDValue Callee = CLI.Callee;
  bool &IsTailCall = CLI.IsTailCall;
  CallingConv::ID &CallConv = CLI.CallConv;
  bool IsVarArg = CLI.IsVarArg;

  // Set when the 'returned' attribute on the first argument lets us forward
  // that argument through as the call result (see the isReturned check below).
  bool IsThisReturn = false;

  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
  bool IsCFICall = CLI.CB && CLI.CB->isIndirectCall() && CLI.CFIType;
  bool IsSibCall = false;
  bool GuardWithBTI = false;

  if (CLI.CB && CLI.CB->hasFnAttr(Attribute::ReturnsTwice) &&
      !Subtarget->noBTIAtReturnTwice()) {
    GuardWithBTI = FuncInfo->branchTargetEnforcement();
  }

  // Analyze operands of the call, assigning locations to each operand.
  CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());

  if (IsVarArg) {
    unsigned NumArgs = Outs.size();

    for (unsigned i = 0; i != NumArgs; ++i) {
      if (!Outs[i].IsFixed && Outs[i].VT.isScalableVector())
        report_fatal_error("Passing SVE types to variadic functions is "
                           "currently not supported");
    }
  }

  analyzeCallOperands(*this, Subtarget, CLI, CCInfo);

  CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
  // Assign locations to each value returned by this call.
  CCState RetCCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
                    *DAG.getContext());
  RetCCInfo.AnalyzeCallResult(Ins, RetCC);

  // Check callee args/returns for SVE registers and set calling convention
  // accordingly.
  if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) {
    auto HasSVERegLoc = [](CCValAssign &Loc) {
      if (!Loc.isRegLoc())
        return false;
      return AArch64::ZPRRegClass.contains(Loc.getLocReg()) ||
             AArch64::PPRRegClass.contains(Loc.getLocReg());
    };
    if (any_of(RVLocs, HasSVERegLoc) || any_of(ArgLocs, HasSVERegLoc))
  }

  if (IsTailCall) {
    // Check if it's really possible to do a tail call.
    IsTailCall = isEligibleForTailCallOptimization(CLI);

    // A sibling call is one where we're under the usual C ABI and not planning
    // to change that but can still do a tail call:
    if (!TailCallOpt && IsTailCall && CallConv != CallingConv::Tail &&
        CallConv != CallingConv::SwiftTail)
      IsSibCall = true;

    if (IsTailCall)
      ++NumTailCalls;
  }

  if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall())
    report_fatal_error("failed to perform tail call elimination on a call "
                       "site marked musttail");

  // Get a count of how many bytes are to be pushed on the stack.
  unsigned NumBytes = CCInfo.getStackSize();

  if (IsSibCall) {
    // Since we're not changing the ABI to make this a tail call, the memory
    // operands are already available in the caller's incoming argument space.
    NumBytes = 0;
  }

  // FPDiff is the byte offset of the call's argument area from the callee's.
  // Stores to callee stack arguments will be placed in FixedStackSlots offset
  // by this amount for a tail call. In a sibling call it must be 0 because the
  // caller will deallocate the entire stack and the callee still expects its
  // arguments to begin at SP+0. Completely unused for non-tail calls.
  int FPDiff = 0;

  if (IsTailCall && !IsSibCall) {
    unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();

    // Since callee will pop argument stack as a tail call, we must keep the
    // popped size 16-byte aligned.
    NumBytes = alignTo(NumBytes, 16);

    // FPDiff will be negative if this tail call requires more space than we
    // would automatically have in our incoming argument space. Positive if we
    // can actually shrink the stack.
    FPDiff = NumReusableBytes - NumBytes;

    // Update the required reserved area if this is the tail call requiring the
    // most argument stack space.
    if (FPDiff < 0 && FuncInfo->getTailCallReservedStack() < (unsigned)-FPDiff)
      FuncInfo->setTailCallReservedStack(-FPDiff);

    // The stack pointer must be 16-byte aligned at all times it's used for a
    // memory operation, which in practice means at *all* times and in
    // particular across call boundaries. Therefore our own arguments started at
    // a 16-byte aligned SP and the delta applied for the tail call should
    // satisfy the same constraint.
    assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
  }

  // Determine whether we need any streaming mode changes.
  SMEAttrs CalleeAttrs, CallerAttrs(MF.getFunction());
  if (CLI.CB)
    CalleeAttrs = SMEAttrs(*CLI.CB);
  else if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
    CalleeAttrs = SMEAttrs(ES->getSymbol());

  // Helper producing a human-readable "call from X to Y" description of this
  // call site for the SME optimization remarks emitted below.
  auto DescribeCallsite =
    R << "call from '" << ore::NV("Caller", MF.getName()) << "' to '";
    if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
      R << ore::NV("Callee", ES->getSymbol());
    else if (CLI.CB && CLI.CB->getCalledFunction())
      R << ore::NV("Callee", CLI.CB->getCalledFunction()->getName());
    else
      R << "unknown callee";
    R << "'";
    return R;
  };

  bool RequiresLazySave = CallerAttrs.requiresLazySave(CalleeAttrs);
  if (RequiresLazySave) {
    // Initialise the TPIDR2 block: store the number of ZA save slices
    // (RDSVL #1) as an i16 at offset 8, then publish the block's address via
    // the aarch64.sme.set.tpidr2 intrinsic.
    unsigned TPIDR2Obj = FuncInfo->getLazySaveTPIDR2Obj();
    SDValue TPIDR2ObjAddr = DAG.getFrameIndex(TPIDR2Obj,
    SDValue NumZaSaveSlicesAddr =
        DAG.getNode(ISD::ADD, DL, TPIDR2ObjAddr.getValueType(), TPIDR2ObjAddr,
                    DAG.getConstant(8, DL, TPIDR2ObjAddr.getValueType()));
    SDValue NumZaSaveSlices = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
                                          DAG.getConstant(1, DL, MVT::i32));
    Chain = DAG.getTruncStore(Chain, DL, NumZaSaveSlices, NumZaSaveSlicesAddr,
                              MPI, MVT::i16);
    Chain = DAG.getNode(
        ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
        DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
        TPIDR2ObjAddr);
    ORE.emit([&]() {
      auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
                                                   CLI.CB)
                      : OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
                                                   &MF.getFunction());
      return DescribeCallsite(R) << " sets up a lazy save for ZA";
    });
  }

  SDValue PStateSM;
  bool RequiresSMChange = CallerAttrs.requiresSMChange(CalleeAttrs);
  if (RequiresSMChange) {
    // The caller's PSTATE.SM is known statically for streaming and
    // non-streaming callers; otherwise it must be read at runtime.
    if (CallerAttrs.hasStreamingInterfaceOrBody())
      PStateSM = DAG.getConstant(1, DL, MVT::i64);
    else if (CallerAttrs.hasNonStreamingInterface())
      PStateSM = DAG.getConstant(0, DL, MVT::i64);
    else
      PStateSM = getRuntimePStateSM(DAG, Chain, DL, MVT::i64);
    ORE.emit([&]() {
      auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMETransition",
                                                   CLI.CB)
                      : OptimizationRemarkAnalysis("sme", "SMETransition",
                                                   &MF.getFunction());
      DescribeCallsite(R) << " requires a streaming mode transition";
      return R;
    });
  }

  SDValue ZTFrameIdx;
  MachineFrameInfo &MFI = MF.getFrameInfo();
  bool ShouldPreserveZT0 = CallerAttrs.requiresPreservingZT0(CalleeAttrs);

  // If the caller has ZT0 state which will not be preserved by the callee,
  // spill ZT0 before the call.
  if (ShouldPreserveZT0) {
    unsigned ZTObj = MFI.CreateSpillStackObject(64, Align(16));
    ZTFrameIdx = DAG.getFrameIndex(
        ZTObj,

    Chain = DAG.getNode(AArch64ISD::SAVE_ZT, DL, DAG.getVTList(MVT::Other),
                        {Chain, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
  }

  // If caller shares ZT0 but the callee is not shared ZA, we need to stop
  // PSTATE.ZA before the call if there is no lazy-save active.
  bool DisableZA = CallerAttrs.requiresDisablingZABeforeCall(CalleeAttrs);
  assert((!DisableZA || !RequiresLazySave) &&
         "Lazy-save should have PSTATE.SM=1 on entry to the function");

  if (DisableZA)
    Chain = DAG.getNode(
        AArch64ISD::SMSTOP, DL, MVT::Other, Chain,
        DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
        DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64));

  // Adjust the stack pointer for the new arguments...
  // These operations are automatically eliminated by the prolog/epilog pass
  if (!IsSibCall)
    Chain = DAG.getCALLSEQ_START(Chain, IsTailCall ? 0 : NumBytes, 0, DL);

  SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,

  SmallSet<unsigned, 8> RegsUsed;
  SmallVector<SDValue, 8> MemOpChains;
  auto PtrVT = getPointerTy(DAG.getDataLayout());

  // For musttail varargs, forward the registers captured in the prologue so
  // the callee observes the same variadic state as this function did.
  if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) {
    const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
    for (const auto &F : Forwards) {
      SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
      RegsToPass.emplace_back(F.PReg, Val);
    }
  }

  // Walk the register/memloc assignments, inserting copies/loads.
  unsigned ExtraArgLocs = 0;
  for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
    CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
    SDValue Arg = OutVals[i];
    ISD::ArgFlagsTy Flags = Outs[i].Flags;

    // Promote the value if needed.
    switch (VA.getLocInfo()) {
    default:
      llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full:
      break;
    case CCValAssign::SExt:
      Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
      break;
    case CCValAssign::ZExt:
      Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
      break;
    case CCValAssign::AExt:
      if (Outs[i].ArgVT == MVT::i1) {
        // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
        //
        // Check if we actually have to do this, because the value may
        // already be zero-extended.
        //
        // We cannot just emit a (zext i8 (trunc (assert-zext i8)))
        // and rely on DAGCombiner to fold this, because the following
        // (anyext i32) is combined with (zext i8) in DAG.getNode:
        //
        // (ext (zext x)) -> (zext x)
        //
        // This will give us (zext i32), which we cannot remove, so
        // try to check this beforehand.
        if (!checkZExtBool(Arg, DAG)) {
          Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
          Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
        }
      }
      Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
      break;
      assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
      Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
      Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
                        DAG.getConstant(32, DL, VA.getLocVT()));
      break;
    case CCValAssign::BCvt:
      Arg = DAG.getBitcast(VA.getLocVT(), Arg);
      break;
    case CCValAssign::Trunc:
      Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
      break;
    case CCValAssign::FPExt:
      Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
      break;
      // Indirectly-passed values are stored to a stack slot and the slot's
      // address is passed instead; multi-part tuples get consecutive stores.
      bool isScalable = VA.getValVT().isScalableVT();
      assert((isScalable || Subtarget->isWindowsArm64EC()) &&
             "Indirect arguments should be scalable on most subtargets");

      uint64_t StoreSize = VA.getValVT().getStoreSize().getKnownMinValue();
      uint64_t PartSize = StoreSize;
      unsigned NumParts = 1;
      if (Outs[i].Flags.isInConsecutiveRegs()) {
        assert(!Outs[i].Flags.isInConsecutiveRegsLast());
        while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
          ++NumParts;
        StoreSize *= NumParts;
      }

      Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext());
      Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
      MachineFrameInfo &MFI = MF.getFrameInfo();
      int FI = MFI.CreateStackObject(StoreSize, Alignment, false);
      if (isScalable)

      SDValue SpillSlot = Ptr;

      // Ensure we generate all stores for each tuple part, whilst updating the
      // pointer after each store correctly using vscale.
      while (NumParts) {
        SDValue Store = DAG.getStore(Chain, DL, OutVals[i], Ptr, MPI);
        MemOpChains.push_back(Store);

        NumParts--;
        if (NumParts > 0) {
          SDValue BytesIncrement;
          if (isScalable) {
            BytesIncrement = DAG.getVScale(
                DL, Ptr.getValueType(),
                APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
          } else {
            BytesIncrement = DAG.getConstant(
                APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
                Ptr.getValueType());
          }
          Flags.setNoUnsignedWrap(true);

          MPI = MachinePointerInfo(MPI.getAddrSpace());
          Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
                            BytesIncrement, Flags);
          ExtraArgLocs++;
          i++;
        }
      }

      Arg = SpillSlot;
      break;
    }

    if (VA.isRegLoc()) {
      if (i == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
          Outs[0].VT == MVT::i64) {
        assert(VA.getLocVT() == MVT::i64 &&
               "unexpected calling convention register assignment");
        assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
               "unexpected use of 'returned'");
        IsThisReturn = true;
      }
      if (RegsUsed.count(VA.getLocReg())) {
        // If this register has already been used then we're trying to pack
        // parts of an [N x i32] into an X-register. The extension type will
        // take care of putting the two halves in the right place but we have to
        // combine them.
        SDValue &Bits =
            llvm::find_if(RegsToPass,
                          [=](const std::pair<unsigned, SDValue> &Elt) {
                            return Elt.first == VA.getLocReg();
                          })
                ->second;
        Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
        // Call site info is used for function's parameter entry value
        // tracking. For now we track only simple cases when parameter
        // is transferred through whole register.
        llvm::erase_if(CSInfo, [&VA](MachineFunction::ArgRegPair ArgReg) {
          return ArgReg.Reg == VA.getLocReg();
        });
      } else {
        // Add an extra level of indirection for streaming mode changes by
        // using a pseudo copy node that cannot be rematerialised between a
        // smstart/smstop and the call by the simple register coalescer.
        if (RequiresSMChange && isPassedInFPR(Arg.getValueType()))
                            Arg.getValueType(), Arg);
        RegsToPass.emplace_back(VA.getLocReg(), Arg);
        RegsUsed.insert(VA.getLocReg());
        const TargetOptions &Options = DAG.getTarget().Options;
        if (Options.EmitCallSiteInfo)
          CSInfo.emplace_back(VA.getLocReg(), i);
      }
    } else {
      assert(VA.isMemLoc());

      SDValue DstAddr;
      MachinePointerInfo DstInfo;

      // FIXME: This works on big-endian for composite byvals, which are the
      // common case. It should also work for fundamental types too.
      uint32_t BEAlign = 0;
      unsigned OpSize;
      if (VA.getLocInfo() == CCValAssign::Indirect ||
        OpSize = VA.getLocVT().getFixedSizeInBits();
      else
        OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
                                 : VA.getValVT().getSizeInBits();
      OpSize = (OpSize + 7) / 8;
      if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
          !Flags.isInConsecutiveRegs()) {
        if (OpSize < 8)
          BEAlign = 8 - OpSize;
      }
      unsigned LocMemOffset = VA.getLocMemOffset();
      int32_t Offset = LocMemOffset + BEAlign;
      SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
      PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);

      if (IsTailCall) {
        Offset = Offset + FPDiff;
        int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);

        DstAddr = DAG.getFrameIndex(FI, PtrVT);
        DstInfo = MachinePointerInfo::getFixedStack(MF, FI);

        // Make sure any stack arguments overlapping with where we're storing
        // are loaded before this eventual operation. Otherwise they'll be
        // clobbered.
        Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
      } else {
        SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);

        DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
        DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
      }

      if (Outs[i].Flags.isByVal()) {
        SDValue SizeNode =
            DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
        SDValue Cpy = DAG.getMemcpy(
            Chain, DL, DstAddr, Arg, SizeNode,
            Outs[i].Flags.getNonZeroByValAlign(),
            /*isVol = */ false, /*AlwaysInline = */ false,
            /*isTailCall = */ false, DstInfo, MachinePointerInfo());

        MemOpChains.push_back(Cpy);
      } else {
        // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
        // promoted to a legal register type i32, we should truncate Arg back to
        // i1/i8/i16.
        if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
            VA.getValVT() == MVT::i16)
          Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);

        SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
        MemOpChains.push_back(Store);
      }
    }
  }

  if (IsVarArg && Subtarget->isWindowsArm64EC()) {
    // For vararg calls, the Arm64EC ABI requires values in x4 and x5
    // describing the argument list. x4 contains the address of the
    // first stack parameter. x5 contains the size in bytes of all parameters
    // passed on the stack.
    RegsToPass.emplace_back(AArch64::X4, StackPtr);
    RegsToPass.emplace_back(AArch64::X5,
                            DAG.getConstant(NumBytes, DL, MVT::i64));
  }

  if (!MemOpChains.empty())
    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);

  // Switch PSTATE.SM to the callee's mode before copying args into registers,
  // gluing the transition into the call sequence.
  SDValue InGlue;
  if (RequiresSMChange) {
    SDValue NewChain =
        changeStreamingMode(DAG, DL, CalleeAttrs.hasStreamingInterface(), Chain,
                            InGlue, PStateSM, true);
    Chain = NewChain.getValue(0);
    InGlue = NewChain.getValue(1);
  }

  // Build a sequence of copy-to-reg nodes chained together with token chain
  // and flag operands which copy the outgoing args into the appropriate regs.
  for (auto &RegToPass : RegsToPass) {
    Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
                             RegToPass.second, InGlue);
    InGlue = Chain.getValue(1);
  }

  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
  // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
  // node so that legalize doesn't hack it.
  if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
    auto GV = G->getGlobal();
    unsigned OpFlags =
    if (OpFlags & AArch64II::MO_GOT) {
      Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
      Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
    } else {
      const GlobalValue *GV = G->getGlobal();
      Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
    }
  } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
      Subtarget->isTargetMachO()) {
      const char *Sym = S->getSymbol();
      Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
    } else {
      const char *Sym = S->getSymbol();
      Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
    }
  }

  // We don't usually want to end the call-sequence here because we would tidy
  // the frame up *after* the call, however in the ABI-changing tail-call case
  // we've carefully laid out the parameters so that when sp is reset they'll be
  // in the correct location.
  if (IsTailCall && !IsSibCall) {
    Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, DL);
    InGlue = Chain.getValue(1);
  }

  std::vector<SDValue> Ops;
  Ops.push_back(Chain);
  Ops.push_back(Callee);

  if (IsTailCall) {
    // Each tail call may have to adjust the stack by a different amount, so
    // this information must travel along with the operation for eventual
    // consumption by emitEpilogue.
    Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
  }

  // Add argument registers to the end of the list so that they are known live
  // into the call.
  for (auto &RegToPass : RegsToPass)
    Ops.push_back(DAG.getRegister(RegToPass.first,
                                  RegToPass.second.getValueType()));

  // Add a register mask operand representing the call-preserved registers.
  const uint32_t *Mask;
  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
  if (IsThisReturn) {
    // For 'this' returns, use the X0-preserving mask if applicable
    Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
    if (!Mask) {
      IsThisReturn = false;
      Mask = TRI->getCallPreservedMask(MF, CallConv);
    }
  } else
    Mask = TRI->getCallPreservedMask(MF, CallConv);

  if (Subtarget->hasCustomCallingConv())
    TRI->UpdateCustomCallPreservedMask(MF, &Mask);

  if (TRI->isAnyArgRegReserved(MF))
    TRI->emitReservedArgRegCallError(MF);

  assert(Mask && "Missing call preserved mask for calling convention");
  Ops.push_back(DAG.getRegisterMask(Mask));

  if (InGlue.getNode())
    Ops.push_back(InGlue);

  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);

  // If we're doing a tail call, use a TC_RETURN here rather than an
  // actual call instruction.
  if (IsTailCall) {
    SDValue Ret = DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);

    if (IsCFICall)
      Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());

    DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
    DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
    return Ret;
  }

  unsigned CallOpc = AArch64ISD::CALL;
  // Calls with operand bundle "clang.arc.attachedcall" are special. They should
  // be expanded to the call, directly followed by a special marker sequence and
  // a call to an ObjC library function. Use CALL_RVMARKER to do that.
  if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
    assert(!IsTailCall &&
           "tail calls cannot be marked with clang.arc.attachedcall");
    CallOpc = AArch64ISD::CALL_RVMARKER;

    // Add a target global address for the retainRV/claimRV runtime function
    // just before the call target.
    Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
    auto GA = DAG.getTargetGlobalAddress(ARCFn, DL, PtrVT);
    Ops.insert(Ops.begin() + 1, GA);
  } else if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
  } else if (GuardWithBTI) {
    CallOpc = AArch64ISD::CALL_BTI;
  }

  // Returns a chain and a flag for retval copy to use.
  Chain = DAG.getNode(CallOpc, DL, NodeTys, Ops);

  if (IsCFICall)
    Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());

  DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
  InGlue = Chain.getValue(1);
  DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));

  uint64_t CalleePopBytes =
      DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;

  Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InGlue, DL);
  InGlue = Chain.getValue(1);

  // Handle result values, copying them out of physregs into vregs that we
  // return.
  SDValue Result = LowerCallResult(
      Chain, InGlue, CallConv, IsVarArg, RVLocs, DL, DAG, InVals, IsThisReturn,
      IsThisReturn ? OutVals[0] : SDValue(), RequiresSMChange);

  if (!Ins.empty())
    InGlue = Result.getValue(Result->getNumValues() - 1);

  // Switch PSTATE.SM back to the caller's mode after the call.
  if (RequiresSMChange) {
    assert(PStateSM && "Expected a PStateSM to be set");
    Result = changeStreamingMode(DAG, DL, !CalleeAttrs.hasStreamingInterface(),
                                 Result, InGlue, PStateSM, false);
  }

  if (CallerAttrs.requiresEnablingZAAfterCall(CalleeAttrs))
    // Unconditionally resume ZA.
    Result = DAG.getNode(
        AArch64ISD::SMSTART, DL, MVT::Other, Result,
        DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
        DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64));

  // Reload ZT0 from the spill slot created before the call.
  if (ShouldPreserveZT0)
    Result =
        DAG.getNode(AArch64ISD::RESTORE_ZT, DL, DAG.getVTList(MVT::Other),
                    {Result, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});

  if (RequiresLazySave) {
    // Conditionally restore the lazy save using a pseudo node.
    unsigned FI = FuncInfo->getLazySaveTPIDR2Obj();
    SDValue RegMask = DAG.getRegisterMask(
        TRI->SMEABISupportRoutinesCallPreservedMaskFromX0());
    SDValue RestoreRoutine = DAG.getTargetExternalSymbol(
        "__arm_tpidr2_restore", getPointerTy(DAG.getDataLayout()));
    SDValue TPIDR2_EL0 = DAG.getNode(
        ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Result,
        DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32));

    // Copy the address of the TPIDR2 block into X0 before 'calling' the
    // RESTORE_ZA pseudo.
    SDValue Glue;
    SDValue TPIDR2Block = DAG.getFrameIndex(
    Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue);
    Result =
        DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other,
                    {Result, TPIDR2_EL0, DAG.getRegister(AArch64::X0, MVT::i64),
                     RestoreRoutine, RegMask, Result.getValue(1)});

    // Finally reset the TPIDR2_EL0 register to 0.
    Result = DAG.getNode(
        ISD::INTRINSIC_VOID, DL, MVT::Other, Result,
        DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
        DAG.getConstant(0, DL, MVT::i64));
  }

  if (RequiresSMChange || RequiresLazySave || ShouldPreserveZT0) {
    for (unsigned I = 0; I < InVals.size(); ++I) {
      // The smstart/smstop is chained as part of the call, but when the
      // resulting chain is discarded (which happens when the call is not part
      // of a chain, e.g. a call to @llvm.cos()), we need to ensure the
      // smstart/smstop is chained to the result value. We can do that by doing
      // a vreg -> vreg copy.
          getRegClassFor(InVals[I].getValueType().getSimpleVT()));
      SDValue X = DAG.getCopyToReg(Result, DL, Reg, InVals[I]);
      InVals[I] = DAG.getCopyFromReg(X, DL, Reg,
                                     InVals[I].getValueType());
    }
  }

  return Result;
}
8241
// CanLowerReturn - Decide whether the values described by Outs can be
// lowered as a return for this calling convention, by running them through
// the convention's return CCAssignFn via CCState::CheckReturn.
bool AArch64TargetLowering::CanLowerReturn(
    CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
  CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
  return CCInfo.CheckReturn(Outs, RetCC);
}
8250
/// Lower an IR 'ret' to AArch64ISD::RET_GLUE (or, for ARM64EC entry thunks,
/// a TC_RETURN that tail-calls the x64 emulator's return dispatcher).
/// Assigns return values to registers per the return CC, ORs together values
/// that share a register, copies them out, and appends implicit operands
/// (sret copy, callee-saved-via-copy registers, glue).
SDValue
AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                                   bool isVarArg,
                                   const SmallVectorImpl<SDValue> &OutVals,
                                   const SDLoc &DL, SelectionDAG &DAG) const {
  auto &MF = DAG.getMachineFunction();
  auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();

  // NOTE(review): the RVLocs declaration line appears to have been dropped by
  // this extraction; AnalyzeReturn fills it with one CCValAssign per value.
  CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
  CCInfo.AnalyzeReturn(Outs, RetCC);

  // Copy the result values into the output registers.
  SDValue Glue;
  // RetVals pairs (physreg, value); RegsUsed detects when two values were
  // assigned the same register so they can be OR-combined below.
  SmallSet<unsigned, 4> RegsUsed;
  for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
       ++i, ++realRVLocIdx) {
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");
    SDValue Arg = OutVals[realRVLocIdx];

    // Adjust the value to the location's type/extension requirements.
    switch (VA.getLocInfo()) {
    default:
      llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full:
      if (Outs[i].ArgVT == MVT::i1) {
        // AAPCS requires i1 to be zero-extended to i8 by the producer of the
        // value. This is strictly redundant on Darwin (which uses "zeroext
        // i1"), but will be optimised out before ISel.
        Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
        Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
      }
      break;
    case CCValAssign::BCvt:
      Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
      break;
    case CCValAssign::AExt:
    case CCValAssign::ZExt:
      Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
      break;
      // NOTE(review): a case label (for the "upper bits" location kind)
      // appears to have been dropped here by the extraction; the code below
      // zero-extends then shifts the 32-bit value into the top half.
      assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
      Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
      Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
                        DAG.getConstant(32, DL, VA.getLocVT()));
      break;
    }

    if (RegsUsed.count(VA.getLocReg())) {
      // Two values landed in the same register (e.g. split halves); merge
      // them with OR rather than emitting a second CopyToReg.
      SDValue &Bits =
          llvm::find_if(RetVals, [=](const std::pair<unsigned, SDValue> &Elt) {
            return Elt.first == VA.getLocReg();
          })->second;
      Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
    } else {
      RetVals.emplace_back(VA.getLocReg(), Arg);
      RegsUsed.insert(VA.getLocReg());
    }
  }

  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();

  // Emit SMSTOP before returning from a locally streaming function
  SMEAttrs FuncAttrs(MF.getFunction());
  if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface()) {
    if (FuncAttrs.hasStreamingCompatibleInterface()) {
      // Conditional switch: restore the entry PSTATE.SM saved in a vreg.
      Register Reg = FuncInfo->getPStateSMReg();
      assert(Reg.isValid() && "PStateSM Register is invalid");
      SDValue PStateSM = DAG.getCopyFromReg(Chain, DL, Reg, MVT::i64);
      Chain =
          changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
                              /*Glue*/ SDValue(), PStateSM, /*Entry*/ false);
    } else
      Chain = changeStreamingMode(
          DAG, DL, /*Enable*/ false, Chain,
          /*Glue*/ SDValue(), DAG.getConstant(1, DL, MVT::i64), /*Entry*/ true);
    Glue = Chain.getValue(1);
  }

  // Build the operand list for the return node: chain first, then each
  // physical return register (each CopyToReg glued to the previous one).
  SmallVector<SDValue, 4> RetOps(1, Chain);
  for (auto &RetVal : RetVals) {
    Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Glue);
    Glue = Chain.getValue(1);
    RetOps.push_back(
        DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
  }

  // Windows AArch64 ABIs require that for returning structs by value we copy
  // the sret argument into X0 for the return.
  // We saved the argument into a virtual register in the entry block,
  // so now we copy the value out and into X0.
  if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
    // NOTE(review): the CopyFromReg value-type argument appears to have been
    // dropped here by the extraction.
    SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg,

    // ARM64EC x64 thunks return the sret pointer in X8, not X0.
    unsigned RetValReg = AArch64::X0;
    if (CallConv == CallingConv::ARM64EC_Thunk_X64)
      RetValReg = AArch64::X8;
    Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Glue);
    Glue = Chain.getValue(1);

    RetOps.push_back(
        DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
  }

  // Functions whose CSRs are saved via copies (e.g. for swifterror) list
  // those registers as implicit return operands so they stay live.
  const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&MF);
  if (I) {
    for (; *I; ++I) {
      if (AArch64::GPR64RegClass.contains(*I))
        RetOps.push_back(DAG.getRegister(*I, MVT::i64));
      else if (AArch64::FPR64RegClass.contains(*I))
        RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
      else
        llvm_unreachable("Unexpected register class in CSRsViaCopy!");
    }
  }

  RetOps[0] = Chain; // Update chain.

  // Add the glue if we have it.
  if (Glue.getNode())
    RetOps.push_back(Glue);

  if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
    // ARM64EC entry thunks use a special return sequence: instead of a regular
    // "ret" instruction, they need to explicitly call the emulator.
    EVT PtrVT = getPointerTy(DAG.getDataLayout());
    SDValue Arm64ECRetDest =
        DAG.getExternalSymbol("__os_arm64x_dispatch_ret", PtrVT);
    Arm64ECRetDest =
        getAddr(cast<ExternalSymbolSDNode>(Arm64ECRetDest), DAG, 0);
    // NOTE(review): the MachinePointerInfo argument of this load appears to
    // have been dropped by the extraction.
    Arm64ECRetDest = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Arm64ECRetDest,
    RetOps.insert(RetOps.begin() + 1, Arm64ECRetDest);
    RetOps.insert(RetOps.begin() + 2, DAG.getTargetConstant(0, DL, MVT::i32));
    return DAG.getNode(AArch64ISD::TC_RETURN, DL, MVT::Other, RetOps);
  }

  return DAG.getNode(AArch64ISD::RET_GLUE, DL, MVT::Other, RetOps);
}
8394
8395//===----------------------------------------------------------------------===//
8396// Other Lowering Code
8397//===----------------------------------------------------------------------===//
8398
8399SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
8400 SelectionDAG &DAG,
8401 unsigned Flag) const {
8402 return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty,
8403 N->getOffset(), Flag);
8404}
8405
8406SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
8407 SelectionDAG &DAG,
8408 unsigned Flag) const {
8409 return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
8410}
8411
8412SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
8413 SelectionDAG &DAG,
8414 unsigned Flag) const {
8415 return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(),
8416 N->getOffset(), Flag);
8417}
8418
8419SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
8420 SelectionDAG &DAG,
8421 unsigned Flag) const {
8422 return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
8423}
8424
8425SDValue AArch64TargetLowering::getTargetNode(ExternalSymbolSDNode *N, EVT Ty,
8426 SelectionDAG &DAG,
8427 unsigned Flag) const {
8428 return DAG.getTargetExternalSymbol(N->getSymbol(), Ty, Flag);
8429}
8430
8431// (loadGOT sym)
8432template <class NodeTy>
8433SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
8434 unsigned Flags) const {
8435 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
8436 SDLoc DL(N);
8437 EVT Ty = getPointerTy(DAG.getDataLayout());
8438 SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
8439 // FIXME: Once remat is capable of dealing with instructions with register
8440 // operands, expand this into two nodes instead of using a wrapper node.
8441 return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr);
8442}
8443
// (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
// Large code model: materialize the full 64-bit address from four 16-bit
// chunks (G3..G0); all but the top chunk use MO_NC (no overflow check).
template <class NodeTy>
SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
                                            unsigned Flags) const {
  LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
  SDLoc DL(N);
  EVT Ty = getPointerTy(DAG.getDataLayout());
  const unsigned char MO_NC = AArch64II::MO_NC;
  // NOTE(review): the wrapper opcode / DL / Ty arguments of this getNode call
  // appear to have been dropped by the extraction.
  return DAG.getNode(
      getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags),
      getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags),
      getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags),
      getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags));
}
8459
// (addlow (adrp %hi(sym)) %lo(sym))
// Small code model: ADRP of the symbol's 4KiB page followed by an ADD of the
// low 12 bits of the address.
template <class NodeTy>
SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
                                       unsigned Flags) const {
  LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
  SDLoc DL(N);
  EVT Ty = getPointerTy(DAG.getDataLayout());
  SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
  // NOTE(review): the page-offset flags of this getTargetNode call and the
  // creation of the ADRP node appear to have been dropped by the extraction.
  SDValue Lo = getTargetNode(N, Ty, DAG,
  return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo);
}
8473
8474// (adr sym)
8475template <class NodeTy>
8476SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG,
8477 unsigned Flags) const {
8478 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n");
8479 SDLoc DL(N);
8480 EVT Ty = getPointerTy(DAG.getDataLayout());
8481 SDValue Sym = getTargetNode(N, Ty, DAG, Flags);
8482 return DAG.getNode(AArch64ISD::ADR, DL, Ty, Sym);
8483}
8484
/// Lower a global address by code model: GOT load when the reference is
/// GOT-based, otherwise the large / tiny / small address sequences.
SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
                                                  SelectionDAG &DAG) const {
  GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
  const GlobalValue *GV = GN->getGlobal();
  unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine());

  // Flagged references (GOT, DLL import, ...) must not carry an offset; the
  // offset would have to be folded after the indirection.
  if (OpFlags != AArch64II::MO_NO_FLAG)
    assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
           "unexpected offset in global node");

  // This also catches the large code model case for Darwin, and tiny code
  // model with got relocations.
  if ((OpFlags & AArch64II::MO_GOT) != 0) {
    return getGOT(GN, DAG, OpFlags);
  }

  // NOTE(review): the declaration of Result and the large-code-model
  // condition opening this if/else chain appear to have been dropped by the
  // extraction.
    Result = getAddrLarge(GN, DAG, OpFlags);
  } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
    Result = getAddrTiny(GN, DAG, OpFlags);
  } else {
    Result = getAddr(GN, DAG, OpFlags);
  }
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDLoc DL(GN);
  // NOTE(review): the condition guarding this extra indirection load and the
  // load's pointer-info argument appear to have been dropped by the
  // extraction.
  Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
  return Result;
}
8517
8518/// Convert a TLS address reference into the correct sequence of loads
8519/// and calls to compute the variable's address (for Darwin, currently) and
8520/// return an SDValue containing the final node.
8521
8522/// Darwin only has one TLS scheme which must be capable of dealing with the
8523/// fully general situation, in the worst case. This means:
8524/// + "extern __thread" declaration.
8525/// + Defined in a possibly unknown dynamic library.
8526///
8527/// The general system is that each __thread variable has a [3 x i64] descriptor
8528/// which contains information used by the runtime to calculate the address. The
8529/// only part of this the compiler needs to know about is the first xword, which
8530/// contains a function pointer that must be called with the address of the
8531/// entire descriptor in "x0".
8532///
8533/// Since this descriptor may be in a different unit, in general even the
8534/// descriptor must be accessed via an indirect load. The "ideal" code sequence
8535/// is:
8536/// adrp x0, _var@TLVPPAGE
8537/// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor
8538/// ldr x1, [x0] ; x1 contains 1st entry of descriptor,
8539/// ; the function pointer
8540/// blr x1 ; Uses descriptor address in x0
8541/// ; Address of _var is now in x0.
8542///
8543/// If the address of _var's descriptor *is* known to the linker, then it can
8544/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
8545/// a slight efficiency gain.
SDValue
AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
                                                   SelectionDAG &DAG) const {
  assert(Subtarget->isTargetDarwin() &&
         "This function expects a Darwin target");

  SDLoc DL(Op);
  MVT PtrVT = getPointerTy(DAG.getDataLayout());
  // PtrMemVT may be narrower than PtrVT under ILP32 (32-bit in-memory
  // pointers on a 64-bit target).
  MVT PtrMemVT = getPointerMemTy(DAG.getDataLayout());
  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();

  // Materialize the address of the variable's TLV descriptor via the GOT.
  SDValue TLVPAddr =
      DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
  SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);

  // The first entry in the descriptor is a function pointer that we must call
  // to obtain the address of the variable.
  SDValue Chain = DAG.getEntryNode();
  // NOTE(review): the pointer-info and memory-operand-flags arguments of this
  // load appear to have been dropped by the extraction.
  SDValue FuncTLVGet = DAG.getLoad(
      PtrMemVT, DL, Chain, DescAddr,
      Align(PtrMemVT.getSizeInBits() / 8),
  Chain = FuncTLVGet.getValue(1);

  // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer.
  FuncTLVGet = DAG.getZExtOrTrunc(FuncTLVGet, DL, PtrVT);

  // NOTE(review): the declaration of MFI (the MachineFrameInfo reference)
  // appears to have been dropped by the extraction.
  MFI.setAdjustsStack(true);

  // TLS calls preserve all registers except those that absolutely must be
  // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
  // silly).
  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
  const uint32_t *Mask = TRI->getTLSCallPreservedMask();
  if (Subtarget->hasCustomCallingConv())
    TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);

  // Finally, we can make the call. This is just a degenerate version of a
  // normal AArch64 call node: x0 takes the address of the descriptor, and
  // returns the address of the variable in this thread.
  Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
  Chain =
      DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
                  Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64),
                  DAG.getRegisterMask(Mask), Chain.getValue(1));
  // The resolver leaves the variable's address in X0.
  return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
}
8595
8596/// Convert a thread-local variable reference into a sequence of instructions to
8597/// compute the variable's address for the local exec TLS model of ELF targets.
8598/// The sequence depends on the maximum TLS area size.
SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV,
                                                    SDValue ThreadBase,
                                                    const SDLoc &DL,
                                                    SelectionDAG &DAG) const {
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue TPOff, Addr;

  // TLSSize is the maximum bit-width of a TP-relative offset the module may
  // need; each tier uses progressively longer instruction sequences.
  switch (DAG.getTarget().Options.TLSSize) {
  default:
    llvm_unreachable("Unexpected TLS size");

  case 12: {
    // mrs x0, TPIDR_EL0
    // add x0, x0, :tprel_lo12:a
    // NOTE(review): the start of the Var declaration (getTargetGlobalAddress
    // call) appears to have been dropped here by the extraction.
        GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF);
    return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
                                      Var,
                                      DAG.getTargetConstant(0, DL, MVT::i32)),
                   0);
  }

  case 24: {
    // mrs x0, TPIDR_EL0
    // add x0, x0, :tprel_hi12:a
    // add x0, x0, :tprel_lo12_nc:a
    SDValue HiVar = DAG.getTargetGlobalAddress(
        GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
    // NOTE(review): the flags line of this getTargetGlobalAddress call
    // appears to have been dropped by the extraction.
    SDValue LoVar = DAG.getTargetGlobalAddress(
        GV, DL, PtrVT, 0,
    Addr = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
                                      HiVar,
                                      DAG.getTargetConstant(0, DL, MVT::i32)),
                   0);
    return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Addr,
                                      LoVar,
                                      DAG.getTargetConstant(0, DL, MVT::i32)),
                   0);
  }

  case 32: {
    // mrs x1, TPIDR_EL0
    // movz x0, #:tprel_g1:a
    // movk x0, #:tprel_g0_nc:a
    // add x0, x1, x0
    SDValue HiVar = DAG.getTargetGlobalAddress(
        GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
    // NOTE(review): flags line dropped by the extraction (as above).
    SDValue LoVar = DAG.getTargetGlobalAddress(
        GV, DL, PtrVT, 0,
    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
                                       DAG.getTargetConstant(16, DL, MVT::i32)),
                    0);
    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
                                       DAG.getTargetConstant(0, DL, MVT::i32)),
                    0);
    return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
  }

  case 48: {
    // mrs x1, TPIDR_EL0
    // movz x0, #:tprel_g2:a
    // movk x0, #:tprel_g1_nc:a
    // movk x0, #:tprel_g0_nc:a
    // add x0, x1, x0
    SDValue HiVar = DAG.getTargetGlobalAddress(
        GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G2);
    // NOTE(review): flags lines of the next two calls dropped by the
    // extraction (as above).
    SDValue MiVar = DAG.getTargetGlobalAddress(
        GV, DL, PtrVT, 0,
    SDValue LoVar = DAG.getTargetGlobalAddress(
        GV, DL, PtrVT, 0,
    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
                                       DAG.getTargetConstant(32, DL, MVT::i32)),
                    0);
    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, MiVar,
                                       DAG.getTargetConstant(16, DL, MVT::i32)),
                    0);
    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
                                       DAG.getTargetConstant(0, DL, MVT::i32)),
                    0);
    return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
  }
  }
}
8686
8687/// When accessing thread-local variables under either the general-dynamic or
8688/// local-dynamic system, we make a "TLS-descriptor" call. The variable will
8689/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
8690/// is a function pointer to carry out the resolution.
8691///
8692/// The sequence is:
8693/// adrp x0, :tlsdesc:var
8694/// ldr x1, [x0, #:tlsdesc_lo12:var]
8695/// add x0, x0, #:tlsdesc_lo12:var
8696/// .tlsdesccall var
8697/// blr x1
8698/// (TPIDR_EL0 offset now in x0)
8699///
8700/// The above sequence must be produced unscheduled, to enable the linker to
8701/// optimize/relax this sequence.
8702/// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
8703/// above sequence, and expanded really late in the compilation flow, to ensure
8704/// the sequence is produced as per above.
8705SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
8706 const SDLoc &DL,
8707 SelectionDAG &DAG) const {
8708 EVT PtrVT = getPointerTy(DAG.getDataLayout());
8709
8710 SDValue Chain = DAG.getEntryNode();
8711 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
8712
8713 Chain =
8714 DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, {Chain, SymAddr});
8715 SDValue Glue = Chain.getValue(1);
8716
8717 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
8718}
8719
/// Lower an ELF TLS reference according to its TLS model: local-exec,
/// initial-exec (GOT-loaded TP offset), local-dynamic (_TLS_MODULE_BASE_
/// descriptor call plus DTPREL adds), or general-dynamic (descriptor call).
SDValue
AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
                                                SelectionDAG &DAG) const {
  assert(Subtarget->isTargetELF() && "This function expects an ELF target");

  const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);

  // NOTE(review): the lines computing the TLS model and the optimization
  // check that can promote LocalDynamic appear to have been dropped here by
  // the extraction.
  if (Model == TLSModel::LocalDynamic)
  }

  // NOTE(review): the code-model half of this condition appears to have been
  // dropped by the extraction.
      Model != TLSModel::LocalExec)
    report_fatal_error("ELF TLS only supported in small memory model or "
                       "in local exec TLS model");
  // Different choices can be made for the maximum size of the TLS area for a
  // module. For the small address model, the default TLS size is 16MiB and the
  // maximum TLS size is 4GiB.
  // FIXME: add tiny and large code model support for TLS access models other
  // than local exec. We currently generate the same code as small for tiny,
  // which may be larger than needed.

  SDValue TPOff;
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDLoc DL(Op);
  const GlobalValue *GV = GA->getGlobal();

  // TPIDR_EL0 holds the thread pointer; every model ends by adding an offset
  // to it.
  SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);

  if (Model == TLSModel::LocalExec) {
    return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG);
  } else if (Model == TLSModel::InitialExec) {
    // The TP offset lives in the GOT; load it.
    TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
    TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
  } else if (Model == TLSModel::LocalDynamic) {
    // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
    // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
    // the beginning of the module's TLS region, followed by a DTPREL offset
    // calculation.

    // These accesses will need deduplicating if there's more than one.
    // NOTE(review): the getInfo<AArch64FunctionInfo>() initializer and the
    // MFI->incNumLocalDynamicTLSAccesses() style bookkeeping appear to have
    // been dropped here by the extraction.
    AArch64FunctionInfo *MFI =

    // The call needs a relocation too for linker relaxation. It doesn't make
    // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
    // the address.
    // NOTE(review): the flags argument of this call appears to have been
    // dropped by the extraction.
    SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,

    // Now we can calculate the offset from TPIDR_EL0 to this module's
    // thread-local area.
    TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);

    // Now use :dtprel_whatever: operations to calculate this variable's offset
    // in its thread-storage area.
    SDValue HiVar = DAG.getTargetGlobalAddress(
        GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
    // NOTE(review): the flags line of this call appears to have been dropped
    // by the extraction.
    SDValue LoVar = DAG.getTargetGlobalAddress(
        GV, DL, MVT::i64, 0,

    TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
                                       DAG.getTargetConstant(0, DL, MVT::i32)),
                    0);
    TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
                                       DAG.getTargetConstant(0, DL, MVT::i32)),
                    0);
  } else if (Model == TLSModel::GeneralDynamic) {
    // The call needs a relocation too for linker relaxation. It doesn't make
    // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
    // the address.
    SDValue SymAddr =
        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);

    // Finally we can make a call to calculate the offset from tpidr_el0.
    TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
  } else
    llvm_unreachable("Unsupported ELF TLS access model");

  return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
}
8806
/// Lower a Windows TLS reference: index off the TEB's TLS array
/// (TEB+0x58) by the module's _tls_index, then add the variable's offset
/// within the .tls section.
SDValue
AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
                                                    SelectionDAG &DAG) const {
  assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");

  SDValue Chain = DAG.getEntryNode();
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDLoc DL(Op);

  // On Windows AArch64, X18 holds the TEB pointer.
  SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64);

  // Load the ThreadLocalStoragePointer from the TEB
  // A pointer to the TLS array is located at offset 0x58 from the TEB.
  SDValue TLSArray =
      DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL));
  TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
  Chain = TLSArray.getValue(1);

  // Load the TLS index from the C runtime;
  // This does the same as getAddr(), but without having a GlobalAddressSDNode.
  // This also does the same as LOADgot, but using a generic i32 load,
  // while LOADgot only loads i64.
  SDValue TLSIndexHi =
      DAG.getTargetExternalSymbol("_tls_index", PtrVT, AArch64II::MO_PAGE);
  SDValue TLSIndexLo = DAG.getTargetExternalSymbol(
      "_tls_index", PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
  SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, TLSIndexHi);
  SDValue TLSIndex =
      DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, TLSIndexLo);
  TLSIndex = DAG.getLoad(MVT::i32, DL, Chain, TLSIndex, MachinePointerInfo());
  Chain = TLSIndex.getValue(1);

  // The pointer to the thread's TLS data area is at the TLS Index scaled by 8
  // offset into the TLSArray.
  TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex);
  SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
                             DAG.getConstant(3, DL, PtrVT));
  // NOTE(review): the pointer-info argument of this load appears to have been
  // dropped by the extraction.
  SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
                            DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
  Chain = TLS.getValue(1);

  const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
  const GlobalValue *GV = GA->getGlobal();
  SDValue TGAHi = DAG.getTargetGlobalAddress(
      GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
  // NOTE(review): the flags line of this call appears to have been dropped by
  // the extraction.
  SDValue TGALo = DAG.getTargetGlobalAddress(
      GV, DL, PtrVT, 0,

  // Add the offset from the start of the .tls section (section base).
  SDValue Addr =
      SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi,
                                 DAG.getTargetConstant(0, DL, MVT::i32)),
              0);
  Addr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, Addr, TGALo);
  return Addr;
}
8865
8866SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
8867 SelectionDAG &DAG) const {
8868 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
8869 if (DAG.getTarget().useEmulatedTLS())
8870 return LowerToTLSEmulatedModel(GA, DAG);
8871
8872 if (Subtarget->isTargetDarwin())
8873 return LowerDarwinGlobalTLSAddress(Op, DAG);
8874 if (Subtarget->isTargetELF())
8875 return LowerELFGlobalTLSAddress(Op, DAG);
8876 if (Subtarget->isTargetWindows())
8877 return LowerWindowsGlobalTLSAddress(Op, DAG);
8878
8879 llvm_unreachable("Unexpected platform trying to use TLS");
8880}
8881
8882// Looks through \param Val to determine the bit that can be used to
8883// check the sign of the value. It returns the unextended value and
8884// the sign bit position.
8885std::pair<SDValue, uint64_t> lookThroughSignExtension(SDValue Val) {
8886 if (Val.getOpcode() == ISD::SIGN_EXTEND_INREG)
8887 return {Val.getOperand(0),
8888 cast<VTSDNode>(Val.getOperand(1))->getVT().getFixedSizeInBits() -
8889 1};
8890
8891 if (Val.getOpcode() == ISD::SIGN_EXTEND)
8892 return {Val.getOperand(0),
8893 Val.getOperand(0)->getValueType(0).getFixedSizeInBits() - 1};
8894
8895 return {Val, Val.getValueSizeInBits() - 1};
8896}
8897
/// Lower BR_CC. Integer compares against 0/-1 are folded to CBZ/CBNZ or
/// TBZ/TBNZ where profitable (and allowed under speculative-load-hardening);
/// overflow intrinsics branch directly on their overflow flag; FP compares
/// may need two conditional branches.
SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
  SDValue LHS = Op.getOperand(2);
  SDValue RHS = Op.getOperand(3);
  SDValue Dest = Op.getOperand(4);
  SDLoc dl(Op);

  // NOTE(review): the MachineFunction reference declaration appears to have
  // been dropped here by the extraction.
  // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
  // will not be produced, as they are conditional branch instructions that do
  // not set flags.
  bool ProduceNonFlagSettingCondBr =
      !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);

  // Handle f128 first, since lowering it will result in comparing the return
  // value of a libcall against zero, which is just what the rest of LowerBR_CC
  // is expecting to deal with.
  if (LHS.getValueType() == MVT::f128) {
    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);

    // If softenSetCCOperands returned a scalar, we need to compare the result
    // against zero to select between true and false values.
    if (!RHS.getNode()) {
      RHS = DAG.getConstant(0, dl, LHS.getValueType());
      CC = ISD::SETNE;
    }
  }

  // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
  // instruction.
  if (ISD::isOverflowIntrOpRes(LHS) && isOneConstant(RHS) &&
      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
    // Only lower legal XALUO ops.
    if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
      return SDValue();

    // The actual operation with overflow check.
    // NOTE(review): the declaration of OFCC (the AArch64 condition code)
    // appears to have been dropped here by the extraction.
    SDValue Value, Overflow;
    std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);

    if (CC == ISD::SETNE)
      OFCC = getInvertedCondCode(OFCC);
    SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32);

    // Branch directly on the overflow flag produced by the operation.
    return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
                       Overflow);
  }

  if (LHS.getValueType().isInteger()) {
    assert((LHS.getValueType() == RHS.getValueType()) &&
           (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));

    // If the RHS of the comparison is zero, we can potentially fold this
    // to a specialized branch.
    const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
    if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) {
      if (CC == ISD::SETEQ) {
        // See if we can use a TBZ to fold in an AND as well.
        // TBZ has a smaller branch displacement than CBZ. If the offset is
        // out of bounds, a late MI-layer pass rewrites branches.
        // 403.gcc is an example that hits this case.
        if (LHS.getOpcode() == ISD::AND &&
            isa<ConstantSDNode>(LHS.getOperand(1)) &&
            isPowerOf2_64(LHS.getConstantOperandVal(1))) {
          SDValue Test = LHS.getOperand(0);
          uint64_t Mask = LHS.getConstantOperandVal(1);
          return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test,
                             DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
                             Dest);
        }

        return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest);
      } else if (CC == ISD::SETNE) {
        // See if we can use a TBZ to fold in an AND as well.
        // TBZ has a smaller branch displacement than CBZ. If the offset is
        // out of bounds, a late MI-layer pass rewrites branches.
        // 403.gcc is an example that hits this case.
        if (LHS.getOpcode() == ISD::AND &&
            isa<ConstantSDNode>(LHS.getOperand(1)) &&
            isPowerOf2_64(LHS.getConstantOperandVal(1))) {
          SDValue Test = LHS.getOperand(0);
          uint64_t Mask = LHS.getConstantOperandVal(1);
          return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test,
                             DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
                             Dest);
        }

        return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest);
      } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
        // x < 0 is exactly "sign bit set": test that bit directly.
        // Don't combine AND since emitComparison converts the AND to an ANDS
        // (a.k.a. TST) and the test in the test bit and branch instruction
        // becomes redundant. This would also increase register pressure.
        uint64_t SignBitPos;
        std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
        return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS,
                           DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
      }
    }
    // x > -1 is exactly "sign bit clear": test that bit directly.
    if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
        LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) {
      // Don't combine AND since emitComparison converts the AND to an ANDS
      // (a.k.a. TST) and the test in the test bit and branch instruction
      // becomes redundant. This would also increase register pressure.
      uint64_t SignBitPos;
      std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
      return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS,
                         DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
    }

    // General case: materialize a compare and branch on its condition.
    SDValue CCVal;
    SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
    return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
                       Cmp);
  }

  assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 ||
         LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);

  // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
  // clean. Some of them require two branches to implement.
  SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
  AArch64CC::CondCode CC1, CC2;
  changeFPCCToAArch64CC(CC, CC1, CC2);
  SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
  SDValue BR1 =
      DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp);
  if (CC2 != AArch64CC::AL) {
    // Second branch for the unordered/extra condition, chained after BR1.
    SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
    return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val,
                       Cmp);
  }

  return BR1;
}
9034
/// Lower FCOPYSIGN via a NEON/SVE bitwise select (BSP) with a mask of every
/// bit except the sign bit: the magnitude comes from In1, the sign from In2.
/// Scalars are moved into vector registers for the select and extracted back.
SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
                                              SelectionDAG &DAG) const {
  if (!Subtarget->hasNEON())
    return SDValue();

  EVT VT = Op.getValueType();
  EVT IntVT = VT.changeTypeToInteger();
  SDLoc DL(Op);

  SDValue In1 = Op.getOperand(0);
  SDValue In2 = Op.getOperand(1);
  EVT SrcVT = In2.getValueType();

  // FCOPYSIGN allows the sign operand to have a different FP type; bring it
  // to the result type first.
  if (!SrcVT.bitsEq(VT))
    In2 = DAG.getFPExtendOrRound(In2, DL, VT);

  // NOTE(review): the right-hand side of this assignment appears to have been
  // dropped by the extraction.
  if (VT.isScalableVector())
    IntVT =

  // Fixed-length vectors that must go through SVE: operate in a scalable
  // container and convert back.
  if (VT.isFixedLengthVector() &&
      useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
    EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);

    In1 = convertToScalableVector(DAG, ContainerVT, In1);
    In2 = convertToScalableVector(DAG, ContainerVT, In2);

    SDValue Res = DAG.getNode(ISD::FCOPYSIGN, DL, ContainerVT, In1, In2);
    return convertFromScalableVector(DAG, VT, Res);
  }

  // Bitcast helper that routes scalable vectors through the SVE-safe path.
  auto BitCast = [this](EVT VT, SDValue Op, SelectionDAG &DAG) {
    if (VT.isScalableVector())
      return getSVESafeBitCast(VT, Op, DAG);

    return DAG.getBitcast(VT, Op);
  };

  // Move both operands into vector registers: scalars are inserted into an
  // undef vector via the given subregister index, vectors are bitcast.
  SDValue VecVal1, VecVal2;
  EVT VecVT;
  auto SetVecVal = [&](int Idx = -1) {
    if (!VT.isVector()) {
      VecVal1 =
          DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In1);
      VecVal2 =
          DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In2);
    } else {
      VecVal1 = BitCast(VecVT, In1, DAG);
      VecVal2 = BitCast(VecVT, In2, DAG);
    }
  };
  if (VT.isVector()) {
    VecVT = IntVT;
    SetVecVal();
  } else if (VT == MVT::f64) {
    VecVT = MVT::v2i64;
    SetVecVal(AArch64::dsub);
  } else if (VT == MVT::f32) {
    VecVT = MVT::v4i32;
    SetVecVal(AArch64::ssub);
  } else if (VT == MVT::f16) {
    VecVT = MVT::v8i16;
    SetVecVal(AArch64::hsub);
  } else {
    llvm_unreachable("Invalid type for copysign!");
  }

  // Mask selecting every bit except the per-element sign bit.
  unsigned BitWidth = In1.getScalarValueSizeInBits();
  SDValue SignMaskV = DAG.getConstant(~APInt::getSignMask(BitWidth), DL, VecVT);

  // We want to materialize a mask with every bit but the high bit set, but the
  // AdvSIMD immediate moves cannot materialize that in a single instruction for
  // 64-bit elements. Instead, materialize all bits set and then negate that.
  if (VT == MVT::f64 || VT == MVT::v2f64) {
    SignMaskV = DAG.getConstant(APInt::getAllOnes(BitWidth), DL, VecVT);
    SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, SignMaskV);
    SignMaskV = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, SignMaskV);
    SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, SignMaskV);
  }

  // BSP: bits set in the mask come from VecVal1, clear bits from VecVal2.
  SDValue BSP =
      DAG.getNode(AArch64ISD::BSP, DL, VecVT, SignMaskV, VecVal1, VecVal2);
  if (VT == MVT::f16)
    return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, BSP);
  if (VT == MVT::f32)
    return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, BSP);
  if (VT == MVT::f64)
    return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, BSP);

  return BitCast(VT, BSP, DAG);
}
9126
9127SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
9128 SelectionDAG &DAG) const {
9130 Attribute::NoImplicitFloat))
9131 return SDValue();
9132
9133 if (!Subtarget->hasNEON())
9134 return SDValue();
9135
9136 bool IsParity = Op.getOpcode() == ISD::PARITY;
9137 SDValue Val = Op.getOperand(0);
9138 SDLoc DL(Op);
9139 EVT VT = Op.getValueType();
9140
9141 // for i32, general parity function using EORs is more efficient compared to
9142 // using floating point
9143 if (VT == MVT::i32 && IsParity)
9144 return SDValue();
9145
9146 // If there is no CNT instruction available, GPR popcount can
9147 // be more efficiently lowered to the following sequence that uses
9148 // AdvSIMD registers/instructions as long as the copies to/from
9149 // the AdvSIMD registers are cheap.
9150 // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd
9151 // CNT V0.8B, V0.8B // 8xbyte pop-counts
9152 // ADDV B0, V0.8B // sum 8xbyte pop-counts
9153 // UMOV X0, V0.B[0] // copy byte result back to integer reg
9154 if (VT == MVT::i32 || VT == MVT::i64) {
9155 if (VT == MVT::i32)
9156 Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
9157 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
9158
9159 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
9160 SDValue UaddLV = DAG.getNode(AArch64ISD::UADDLV, DL, MVT::v4i32, CtPop);
9161 UaddLV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, UaddLV,
9162 DAG.getConstant(0, DL, MVT::i64));
9163
9164 if (IsParity)
9165 UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV,
9166 DAG.getConstant(1, DL, MVT::i32));
9167
9168 if (VT == MVT::i64)
9169 UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV);
9170 return UaddLV;
9171 } else if (VT == MVT::i128) {
9172 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val);
9173
9174 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val);
9175 SDValue UaddLV = DAG.getNode(AArch64ISD::UADDLV, DL, MVT::v4i32, CtPop);
9176 UaddLV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, UaddLV,
9177 DAG.getConstant(0, DL, MVT::i64));
9178
9179 if (IsParity)
9180 UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV,
9181 DAG.getConstant(1, DL, MVT::i32));
9182
9183 return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i128, UaddLV);
9184 }
9185
9186 assert(!IsParity && "ISD::PARITY of vector types not supported");
9187
9188 if (VT.isScalableVector() ||
9190 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU);
9191
9192 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
9193 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
9194 "Unexpected type for custom ctpop lowering");
9195
9196 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
9197 Val = DAG.getBitcast(VT8Bit, Val);
9198 Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val);
9199
9200 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
9201 unsigned EltSize = 8;
9202 unsigned NumElts = VT.is64BitVector() ? 8 : 16;
9203 while (EltSize != VT.getScalarSizeInBits()) {
9204 EltSize *= 2;
9205 NumElts /= 2;
9206 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
9207 Val = DAG.getNode(
9208 ISD::INTRINSIC_WO_CHAIN, DL, WidenVT,
9209 DAG.getConstant(Intrinsic::aarch64_neon_uaddlp, DL, MVT::i32), Val);
9210 }
9211
9212 return Val;
9213}
9214
9215SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
9216 EVT VT = Op.getValueType();
9217 assert(VT.isScalableVector() ||
9219 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()));
9220
9221 SDLoc DL(Op);
9222 SDValue RBIT = DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(0));
9223 return DAG.getNode(ISD::CTLZ, DL, VT, RBIT);
9224}
9225
9226SDValue AArch64TargetLowering::LowerMinMax(SDValue Op,
9227 SelectionDAG &DAG) const {
9228
9229 EVT VT = Op.getValueType();
9230 SDLoc DL(Op);
9231 unsigned Opcode = Op.getOpcode();
9233 switch (Opcode) {
9234 default:
9235 llvm_unreachable("Wrong instruction");
9236 case ISD::SMAX:
9237 CC = ISD::SETGT;
9238 break;
9239 case ISD::SMIN:
9240 CC = ISD::SETLT;
9241 break;
9242 case ISD::UMAX:
9243 CC = ISD::SETUGT;
9244 break;
9245 case ISD::UMIN:
9246 CC = ISD::SETULT;
9247 break;
9248 }
9249
9250 if (VT.isScalableVector() ||
9252 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
9253 switch (Opcode) {
9254 default:
9255 llvm_unreachable("Wrong instruction");
9256 case ISD::SMAX:
9257 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED);
9258 case ISD::SMIN:
9259 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED);
9260 case ISD::UMAX:
9261 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED);
9262 case ISD::UMIN:
9263 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED);
9264 }
9265 }
9266
9267 SDValue Op0 = Op.getOperand(0);
9268 SDValue Op1 = Op.getOperand(1);
9269 SDValue Cond = DAG.getSetCC(DL, VT, Op0, Op1, CC);
9270 return DAG.getSelect(DL, VT, Cond, Op0, Op1);
9271}
9272
9273SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op,
9274 SelectionDAG &DAG) const {
9275 EVT VT = Op.getValueType();
9276
9277 if (VT.isScalableVector() ||
9279 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
9280 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU);
9281
9282 SDLoc DL(Op);
9283 SDValue REVB;
9284 MVT VST;
9285
9286 switch (VT.getSimpleVT().SimpleTy) {
9287 default:
9288 llvm_unreachable("Invalid type for bitreverse!");
9289
9290 case MVT::v2i32: {
9291 VST = MVT::v8i8;
9292 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
9293
9294 break;
9295 }
9296
9297 case MVT::v4i32: {
9298 VST = MVT::v16i8;
9299 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
9300
9301 break;
9302 }
9303
9304 case MVT::v1i64: {
9305 VST = MVT::v8i8;
9306 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
9307
9308 break;
9309 }
9310
9311 case MVT::v2i64: {
9312 VST = MVT::v16i8;
9313 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
9314
9315 break;
9316 }
9317 }
9318
9319 return DAG.getNode(AArch64ISD::NVCAST, DL, VT,
9320 DAG.getNode(ISD::BITREVERSE, DL, VST, REVB));
9321}
9322
9323// Check whether the continuous comparison sequence.
9324static bool
9325isOrXorChain(SDValue N, unsigned &Num,
9326 SmallVector<std::pair<SDValue, SDValue>, 16> &WorkList) {
9327 if (Num == MaxXors)
9328 return false;
9329
9330 // Skip the one-use zext
9331 if (N->getOpcode() == ISD::ZERO_EXTEND && N->hasOneUse())
9332 N = N->getOperand(0);
9333
9334 // The leaf node must be XOR
9335 if (N->getOpcode() == ISD::XOR) {
9336 WorkList.push_back(std::make_pair(N->getOperand(0), N->getOperand(1)));
9337 Num++;
9338 return true;
9339 }
9340
9341 // All the non-leaf nodes must be OR.
9342 if (N->getOpcode() != ISD::OR || !N->hasOneUse())
9343 return false;
9344
9345 if (isOrXorChain(N->getOperand(0), Num, WorkList) &&
9346 isOrXorChain(N->getOperand(1), Num, WorkList))
9347 return true;
9348 return false;
9349}
9350
9351// Transform chains of ORs and XORs, which usually outlined by memcmp/bmp.
9353 SDValue LHS = N->getOperand(0);
9354 SDValue RHS = N->getOperand(1);
9355 SDLoc DL(N);
9356 EVT VT = N->getValueType(0);
9358
9359 // Only handle integer compares.
9360 if (N->getOpcode() != ISD::SETCC)
9361 return SDValue();
9362
9363 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
9364 // Try to express conjunction "cmp 0 (or (xor A0 A1) (xor B0 B1))" as:
9365 // sub A0, A1; ccmp B0, B1, 0, eq; cmp inv(Cond) flag
9366 unsigned NumXors = 0;
9367 if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(RHS) &&
9368 LHS->getOpcode() == ISD::OR && LHS->hasOneUse() &&
9369 isOrXorChain(LHS, NumXors, WorkList)) {
9370 SDValue XOR0, XOR1;
9371 std::tie(XOR0, XOR1) = WorkList[0];
9372 unsigned LogicOp = (Cond == ISD::SETEQ) ? ISD::AND : ISD::OR;
9373 SDValue Cmp = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
9374 for (unsigned I = 1; I < WorkList.size(); I++) {
9375 std::tie(XOR0, XOR1) = WorkList[I];
9376 SDValue CmpChain = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
9377 Cmp = DAG.getNode(LogicOp, DL, VT, Cmp, CmpChain);
9378 }
9379
9380 // Exit early by inverting the condition, which help reduce indentations.
9381 return Cmp;
9382 }
9383
9384 return SDValue();
9385}
9386
9387SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
9388
9389 if (Op.getValueType().isVector())
9390 return LowerVSETCC(Op, DAG);
9391
9392 bool IsStrict = Op->isStrictFPOpcode();
9393 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
9394 unsigned OpNo = IsStrict ? 1 : 0;
9395 SDValue Chain;
9396 if (IsStrict)
9397 Chain = Op.getOperand(0);
9398 SDValue LHS = Op.getOperand(OpNo + 0);
9399 SDValue RHS = Op.getOperand(OpNo + 1);
9400 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(OpNo + 2))->get();
9401 SDLoc dl(Op);
9402
9403 // We chose ZeroOrOneBooleanContents, so use zero and one.
9404 EVT VT = Op.getValueType();
9405 SDValue TVal = DAG.getConstant(1, dl, VT);
9406 SDValue FVal = DAG.getConstant(0, dl, VT);
9407
9408 // Handle f128 first, since one possible outcome is a normal integer
9409 // comparison which gets picked up by the next if statement.
9410 if (LHS.getValueType() == MVT::f128) {
9411 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS, Chain,
9412 IsSignaling);
9413
9414 // If softenSetCCOperands returned a scalar, use it.
9415 if (!RHS.getNode()) {
9416 assert(LHS.getValueType() == Op.getValueType() &&
9417 "Unexpected setcc expansion!");
9418 return IsStrict ? DAG.getMergeValues({LHS, Chain}, dl) : LHS;
9419 }
9420 }
9421
9422 if (LHS.getValueType().isInteger()) {
9423 SDValue CCVal;
9425 LHS, RHS, ISD::getSetCCInverse(CC, LHS.getValueType()), CCVal, DAG, dl);
9426
9427 // Note that we inverted the condition above, so we reverse the order of
9428 // the true and false operands here. This will allow the setcc to be
9429 // matched to a single CSINC instruction.
9430 SDValue Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp);
9431 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
9432 }
9433
9434 // Now we know we're dealing with FP values.
9435 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
9436 LHS.getValueType() == MVT::f64);
9437
9438 // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
9439 // and do the comparison.
9440 SDValue Cmp;
9441 if (IsStrict)
9442 Cmp = emitStrictFPComparison(LHS, RHS, dl, DAG, Chain, IsSignaling);
9443 else
9444 Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
9445
9446 AArch64CC::CondCode CC1, CC2;
9447 changeFPCCToAArch64CC(CC, CC1, CC2);
9448 SDValue Res;
9449 if (CC2 == AArch64CC::AL) {
9450 changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1,
9451 CC2);
9452 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
9453
9454 // Note that we inverted the condition above, so we reverse the order of
9455 // the true and false operands here. This will allow the setcc to be
9456 // matched to a single CSINC instruction.
9457 Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp);
9458 } else {
9459 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
9460 // totally clean. Some of them require two CSELs to implement. As is in
9461 // this case, we emit the first CSEL and then emit a second using the output
9462 // of the first as the RHS. We're effectively OR'ing the two CC's together.
9463
9464 // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
9465 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
9466 SDValue CS1 =
9467 DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
9468
9469 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
9470 Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
9471 }
9472 return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, dl) : Res;
9473}
9474
9475SDValue AArch64TargetLowering::LowerSETCCCARRY(SDValue Op,
9476 SelectionDAG &DAG) const {
9477
9478 SDValue LHS = Op.getOperand(0);
9479 SDValue RHS = Op.getOperand(1);
9480 EVT VT = LHS.getValueType();
9481 if (VT != MVT::i32 && VT != MVT::i64)
9482 return SDValue();
9483
9484 SDLoc DL(Op);
9485 SDValue Carry = Op.getOperand(2);
9486 // SBCS uses a carry not a borrow so the carry flag should be inverted first.
9487 SDValue InvCarry = valueToCarryFlag(Carry, DAG, true);
9488 SDValue Cmp = DAG.getNode(AArch64ISD::SBCS, DL, DAG.getVTList(VT, MVT::Glue),
9489 LHS, RHS, InvCarry);
9490
9491 EVT OpVT = Op.getValueType();
9492 SDValue TVal = DAG.getConstant(1, DL, OpVT);
9493 SDValue FVal = DAG.getConstant(0, DL, OpVT);
9494
9495 ISD::CondCode Cond = cast<CondCodeSDNode>(Op.getOperand(3))->get();
9497 SDValue CCVal =
9498 DAG.getConstant(changeIntCCToAArch64CC(CondInv), DL, MVT::i32);
9499 // Inputs are swapped because the condition is inverted. This will allow
9500 // matching with a single CSINC instruction.
9501 return DAG.getNode(AArch64ISD::CSEL, DL, OpVT, FVal, TVal, CCVal,
9502 Cmp.getValue(1));
9503}
9504
9505SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
9506 SDValue RHS, SDValue TVal,
9507 SDValue FVal, const SDLoc &dl,
9508 SelectionDAG &DAG) const {
9509 // Handle f128 first, because it will result in a comparison of some RTLIB
9510 // call result against zero.
9511 if (LHS.getValueType() == MVT::f128) {
9512 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
9513
9514 // If softenSetCCOperands returned a scalar, we need to compare the result
9515 // against zero to select between true and false values.
9516 if (!RHS.getNode()) {
9517 RHS = DAG.getConstant(0, dl, LHS.getValueType());
9518 CC = ISD::SETNE;
9519 }
9520 }
9521
9522 // Also handle f16, for which we need to do a f32 comparison.
9523 if (LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
9524 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
9525 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
9526 }
9527
9528 // Next, handle integers.
9529 if (LHS.getValueType().isInteger()) {
9530 assert((LHS.getValueType() == RHS.getValueType()) &&
9531 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
9532
9533 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
9534 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
9535 ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
9536 // Check for sign pattern (SELECT_CC setgt, iN lhs, -1, 1, -1) and transform
9537 // into (OR (ASR lhs, N-1), 1), which requires less instructions for the
9538 // supported types.
9539 if (CC == ISD::SETGT && RHSC && RHSC->isAllOnes() && CTVal && CFVal &&
9540 CTVal->isOne() && CFVal->isAllOnes() &&
9541 LHS.getValueType() == TVal.getValueType()) {
9542 EVT VT = LHS.getValueType();
9543 SDValue Shift =
9544 DAG.getNode(ISD::SRA, dl, VT, LHS,
9545 DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
9546 return DAG.getNode(ISD::OR, dl, VT, Shift, DAG.getConstant(1, dl, VT));
9547 }
9548
9549 // Check for SMAX(lhs, 0) and SMIN(lhs, 0) patterns.
9550 // (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1))
9551 // (SELECT_CC setlt, lhs, 0, lhs, 0) -> (AND lhs, (SRA lhs, typesize-1))
9552 // Both require less instructions than compare and conditional select.
9553 if ((CC == ISD::SETGT || CC == ISD::SETLT) && LHS == TVal &&
9554 RHSC && RHSC->isZero() && CFVal && CFVal->isZero() &&
9555 LHS.getValueType() == RHS.getValueType()) {
9556 EVT VT = LHS.getValueType();
9557 SDValue Shift =
9558 DAG.getNode(ISD::SRA, dl, VT, LHS,
9559 DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
9560
9561 if (CC == ISD::SETGT)
9562 Shift = DAG.getNOT(dl, Shift, VT);
9563
9564 return DAG.getNode(ISD::AND, dl, VT, LHS, Shift);
9565 }
9566
9567 unsigned Opcode = AArch64ISD::CSEL;
9568
9569 // If both the TVal and the FVal are constants, see if we can swap them in
9570 // order to for a CSINV or CSINC out of them.
9571 if (CTVal && CFVal && CTVal->isAllOnes() && CFVal->isZero()) {
9572 std::swap(TVal, FVal);
9573 std::swap(CTVal, CFVal);
9574 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
9575 } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isZero()) {
9576 std::swap(TVal, FVal);
9577 std::swap(CTVal, CFVal);
9578 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
9579 } else if (TVal.getOpcode() == ISD::XOR) {
9580 // If TVal is a NOT we want to swap TVal and FVal so that we can match
9581 // with a CSINV rather than a CSEL.
9582 if (isAllOnesConstant(TVal.getOperand(1))) {
9583 std::swap(TVal, FVal);
9584 std::swap(CTVal, CFVal);
9585 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
9586 }
9587 } else if (TVal.getOpcode() == ISD::SUB) {
9588 // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
9589 // that we can match with a CSNEG rather than a CSEL.
9590 if (isNullConstant(TVal.getOperand(0))) {
9591 std::swap(TVal, FVal);
9592 std::swap(CTVal, CFVal);
9593 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
9594 }
9595 } else if (CTVal && CFVal) {
9596 const int64_t TrueVal = CTVal->getSExtValue();
9597 const int64_t FalseVal = CFVal->getSExtValue();
9598 bool Swap = false;
9599
9600 // If both TVal and FVal are constants, see if FVal is the
9601 // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
9602 // instead of a CSEL in that case.
9603 if (TrueVal == ~FalseVal) {
9604 Opcode = AArch64ISD::CSINV;
9605 } else if (FalseVal > std::numeric_limits<int64_t>::min() &&
9606 TrueVal == -FalseVal) {
9607 Opcode = AArch64ISD::CSNEG;
9608 } else if (TVal.getValueType() == MVT::i32) {
9609 // If our operands are only 32-bit wide, make sure we use 32-bit
9610 // arithmetic for the check whether we can use CSINC. This ensures that
9611 // the addition in the check will wrap around properly in case there is
9612 // an overflow (which would not be the case if we do the check with
9613 // 64-bit arithmetic).
9614 const uint32_t TrueVal32 = CTVal->getZExtValue();
9615 const uint32_t FalseVal32 = CFVal->getZExtValue();
9616
9617 if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
9618 Opcode = AArch64ISD::CSINC;
9619
9620 if (TrueVal32 > FalseVal32) {
9621 Swap = true;
9622 }
9623 }
9624 } else {
9625 // 64-bit check whether we can use CSINC.
9626 const uint64_t TrueVal64 = TrueVal;
9627 const uint64_t FalseVal64 = FalseVal;
9628
9629 if ((TrueVal64 == FalseVal64 + 1) || (TrueVal64 + 1 == FalseVal64)) {
9630 Opcode = AArch64ISD::CSINC;
9631
9632 if (TrueVal > FalseVal) {
9633 Swap = true;
9634 }
9635 }
9636 }
9637
9638 // Swap TVal and FVal if necessary.
9639 if (Swap) {
9640 std::swap(TVal, FVal);
9641 std::swap(CTVal, CFVal);
9642 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
9643 }
9644
9645 if (Opcode != AArch64ISD::CSEL) {
9646 // Drop FVal since we can get its value by simply inverting/negating
9647 // TVal.
9648 FVal = TVal;
9649 }
9650 }
9651
9652 // Avoid materializing a constant when possible by reusing a known value in
9653 // a register. However, don't perform this optimization if the known value
9654 // is one, zero or negative one in the case of a CSEL. We can always
9655 // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
9656 // FVal, respectively.
9657 ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS);
9658 if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
9659 !RHSVal->isZero() && !RHSVal->isAllOnes()) {
9661 // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
9662 // "a != C ? x : a" to avoid materializing C.
9663 if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
9664 TVal = LHS;
9665 else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
9666 FVal = LHS;
9667 } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
9668 assert (CTVal && CFVal && "Expected constant operands for CSNEG.");
9669 // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
9670 // avoid materializing C.
9672 if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
9673 Opcode = AArch64ISD::CSINV;
9674 TVal = LHS;
9675 FVal = DAG.getConstant(0, dl, FVal.getValueType());
9676 }
9677 }
9678
9679 SDValue CCVal;
9680 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
9681 EVT VT = TVal.getValueType();
9682 return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp);
9683 }
9684
9685 // Now we know we're dealing with FP values.
9686 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
9687 LHS.getValueType() == MVT::f64);
9688 assert(LHS.getValueType() == RHS.getValueType());
9689 EVT VT = TVal.getValueType();
9690 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
9691
9692 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
9693 // clean. Some of them require two CSELs to implement.
9694 AArch64CC::CondCode CC1, CC2;
9695 changeFPCCToAArch64CC(CC, CC1, CC2);
9696
9697 if (DAG.getTarget().Options.UnsafeFPMath) {
9698 // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
9699 // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
9700 ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
9701 if (RHSVal && RHSVal->isZero()) {
9702 ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal);
9703 ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal);
9704
9705 if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
9706 CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
9707 TVal = LHS;
9708 else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
9709 CFVal && CFVal->isZero() &&
9710 FVal.getValueType() == LHS.getValueType())
9711 FVal = LHS;
9712 }
9713 }
9714
9715 // Emit first, and possibly only, CSEL.
9716 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
9717 SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
9718
9719 // If we need a second CSEL, emit it, using the output of the first as the
9720 // RHS. We're effectively OR'ing the two CC's together.
9721 if (CC2 != AArch64CC::AL) {
9722 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
9723 return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
9724 }
9725
9726 // Otherwise, return the output of the first CSEL.
9727 return CS1;
9728}
9729
9730SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op,
9731 SelectionDAG &DAG) const {
9732 EVT Ty = Op.getValueType();
9733 auto Idx = Op.getConstantOperandAPInt(2);
9734 int64_t IdxVal = Idx.getSExtValue();
9735 assert(Ty.isScalableVector() &&
9736 "Only expect scalable vectors for custom lowering of VECTOR_SPLICE");
9737
9738 // We can use the splice instruction for certain index values where we are
9739 // able to efficiently generate the correct predicate. The index will be
9740 // inverted and used directly as the input to the ptrue instruction, i.e.
9741 // -1 -> vl1, -2 -> vl2, etc. The predicate will then be reversed to get the
9742 // splice predicate. However, we can only do this if we can guarantee that
9743 // there are enough elements in the vector, hence we check the index <= min
9744 // number of elements.
9745 std::optional<unsigned> PredPattern;
9746 if (Ty.isScalableVector() && IdxVal < 0 &&
9747 (PredPattern = getSVEPredPatternFromNumElements(std::abs(IdxVal))) !=
9748 std::nullopt) {
9749 SDLoc DL(Op);
9750
9751 // Create a predicate where all but the last -IdxVal elements are false.
9752 EVT PredVT = Ty.changeVectorElementType(MVT::i1);
9753 SDValue Pred = getPTrue(DAG, DL, PredVT, *PredPattern);
9754 Pred = DAG.getNode(ISD::VECTOR_REVERSE, DL, PredVT, Pred);
9755
9756 // Now splice the two inputs together using the predicate.
9757 return DAG.getNode(AArch64ISD::SPLICE, DL, Ty, Pred, Op.getOperand(0),
9758 Op.getOperand(1));
9759 }
9760
9761 // This will select to an EXT instruction, which has a maximum immediate
9762 // value of 255, hence 2048-bits is the maximum value we can lower.
9763 if (IdxVal >= 0 &&
9764 IdxVal < int64_t(2048 / Ty.getVectorElementType().getSizeInBits()))
9765 return Op;
9766
9767 return SDValue();
9768}
9769
9770SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
9771 SelectionDAG &DAG) const {
9772 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
9773 SDValue LHS = Op.getOperand(0);
9774 SDValue RHS = Op.getOperand(1);
9775 SDValue TVal = Op.getOperand(2);
9776 SDValue FVal = Op.getOperand(3);
9777 SDLoc DL(Op);
9778 return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
9779}
9780
9781SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
9782 SelectionDAG &DAG) const {
9783 SDValue CCVal = Op->getOperand(0);
9784 SDValue TVal = Op->getOperand(1);
9785 SDValue FVal = Op->getOperand(2);
9786 SDLoc DL(Op);
9787
9788 EVT Ty = Op.getValueType();
9789 if (Ty == MVT::aarch64svcount) {
9790 TVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, TVal);
9791 FVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, FVal);
9792 SDValue Sel =
9793 DAG.getNode(ISD::SELECT, DL, MVT::nxv16i1, CCVal, TVal, FVal);
9794 return DAG.getNode(ISD::BITCAST, DL, Ty, Sel);
9795 }
9796
9797 if (Ty.isScalableVector()) {
9798 MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount());
9799 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, CCVal);
9800 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
9801 }
9802
9803 if (useSVEForFixedLengthVectorVT(Ty, !Subtarget->isNeonAvailable())) {
9804 // FIXME: Ideally this would be the same as above using i1 types, however
9805 // for the moment we can't deal with fixed i1 vector types properly, so
9806 // instead extend the predicate to a result type sized integer vector.
9807 MVT SplatValVT = MVT::getIntegerVT(Ty.getScalarSizeInBits());
9808 MVT PredVT = MVT::getVectorVT(SplatValVT, Ty.getVectorElementCount());
9809 SDValue SplatVal = DAG.getSExtOrTrunc(CCVal, DL, SplatValVT);
9810 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, SplatVal);
9811 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
9812 }
9813
9814 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
9815 // instruction.
9816 if (ISD::isOverflowIntrOpRes(CCVal)) {
9817 // Only lower legal XALUO ops.
9818 if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0)))
9819 return SDValue();
9820
9822 SDValue Value, Overflow;
9823 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG);
9824 SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32);
9825
9826 return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
9827 CCVal, Overflow);
9828 }
9829
9830 // Lower it the same way as we would lower a SELECT_CC node.
9832 SDValue LHS, RHS;
9833 if (CCVal.getOpcode() == ISD::SETCC) {
9834 LHS = CCVal.getOperand(0);
9835 RHS = CCVal.getOperand(1);
9836 CC = cast<CondCodeSDNode>(CCVal.getOperand(2))->get();
9837 } else {
9838 LHS = CCVal;
9839 RHS = DAG.getConstant(0, DL, CCVal.getValueType());
9840 CC = ISD::SETNE;
9841 }
9842
9843 // If we are lowering a f16 and we do not have fullf16, convert to a f32 in
9844 // order to use FCSELSrrr
9845 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
9846 TVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
9847 DAG.getUNDEF(MVT::f32), TVal);
9848 FVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
9849 DAG.getUNDEF(MVT::f32), FVal);
9850 }
9851
9852 SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
9853
9854 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
9855 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, Ty, Res);
9856 }
9857
9858 return Res;
9859}
9860
9861SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
9862 SelectionDAG &DAG) const {
9863 // Jump table entries as PC relative offsets. No additional tweaking
9864 // is necessary here. Just get the address of the jump table.
9865 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
9866
9869 !Subtarget->isTargetMachO())
9870 return getAddrLarge(JT, DAG);
9871 if (CM == CodeModel::Tiny)
9872 return getAddrTiny(JT, DAG);
9873 return getAddr(JT, DAG);
9874}
9875
9876SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
9877 SelectionDAG &DAG) const {
9878 // Jump table entries as PC relative offsets. No additional tweaking
9879 // is necessary here. Just get the address of the jump table.
9880 SDLoc DL(Op);
9881 SDValue JT = Op.getOperand(1);
9882 SDValue Entry = Op.getOperand(2);
9883 int JTI = cast<JumpTableSDNode>(JT.getNode())->getIndex();
9884
9885 auto *AFI = DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
9886 AFI->setJumpTableEntryInfo(JTI, 4, nullptr);
9887
9888 SDNode *Dest =
9889 DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT,
9890 Entry, DAG.getTargetJumpTable(JTI, MVT::i32));
9891 SDValue JTInfo = DAG.getJumpTableDebugInfo(JTI, Op.getOperand(0), DL);
9892 return DAG.getNode(ISD::BRIND, DL, MVT::Other, JTInfo, SDValue(Dest, 0));
9893}
9894
9895SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
9896 SelectionDAG &DAG) const {
9897 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
9899 if (CM == CodeModel::Large) {
9900 // Use the GOT for the large code model on iOS.
9901 if (Subtarget->isTargetMachO()) {
9902 return getGOT(CP, DAG);
9903 }
9905 return getAddrLarge(CP, DAG);
9906 } else if (CM == CodeModel::Tiny) {
9907 return getAddrTiny(CP, DAG);
9908 }
9909 return getAddr(CP, DAG);
9910}
9911
9912SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
9913 SelectionDAG &DAG) const {
9914 BlockAddressSDNode *BA = cast<BlockAddressSDNode>(Op);
9916 if (CM == CodeModel::Large && !Subtarget->isTargetMachO()) {
9918 return getAddrLarge(BA, DAG);
9919 } else if (CM == CodeModel::Tiny) {
9920 return getAddrTiny(BA, DAG);
9921 }
9922 return getAddr(BA, DAG);
9923}
9924
9925SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
9926 SelectionDAG &DAG) const {
9927 AArch64FunctionInfo *FuncInfo =
9929
9930 SDLoc DL(Op);
9931 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(),
9933 FR = DAG.getZExtOrTrunc(FR, DL, getPointerMemTy(DAG.getDataLayout()));
9934 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
9935 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
9936 MachinePointerInfo(SV));
9937}
9938
9939SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
9940 SelectionDAG &DAG) const {
9943
9944 SDLoc DL(Op);
9945 SDValue FR;
9946 if (Subtarget->isWindowsArm64EC()) {
9947 // With the Arm64EC ABI, we compute the address of the varargs save area
9948 // relative to x4. For a normal AArch64->AArch64 call, x4 == sp on entry,
9949 // but calls from an entry thunk can pass in a different address.
9950 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
9951 SDValue Val = DAG.getCopyFromReg(DAG.getEntryNode(), DL, VReg, MVT::i64);
9953 if (FuncInfo->getVarArgsGPRSize() > 0)
9954 StackOffset = -(uint64_t)FuncInfo->getVarArgsGPRSize();
9955 else
9956 StackOffset = FuncInfo->getVarArgsStackOffset();
9957 FR = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
9958 DAG.getConstant(StackOffset, DL, MVT::i64));
9959 } else {
9960 FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0
9961 ? FuncInfo->getVarArgsGPRIndex()
9962 : FuncInfo->getVarArgsStackIndex(),
9964 }
9965 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
9966 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
9967 MachinePointerInfo(SV));
9968}
9969
9970SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
9971 SelectionDAG &DAG) const {
9972 // The layout of the va_list struct is specified in the AArch64 Procedure Call
9973 // Standard, section B.3.
9976 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
9977 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
9978 auto PtrVT = getPointerTy(DAG.getDataLayout());
9979 SDLoc DL(Op);
9980
9981 SDValue Chain = Op.getOperand(0);
9982 SDValue VAList = Op.getOperand(1);
9983 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
9985
9986 // void *__stack at offset 0
9987 unsigned Offset = 0;
9988 SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT);
9989 Stack = DAG.getZExtOrTrunc(Stack, DL, PtrMemVT);
9990 MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
9991 MachinePointerInfo(SV), Align(PtrSize)));
9992
9993 // void *__gr_top at offset 8 (4 on ILP32)
9994 Offset += PtrSize;
9995 int GPRSize = FuncInfo->getVarArgsGPRSize();
9996 if (GPRSize > 0) {
9997 SDValue GRTop, GRTopAddr;
9998
9999 GRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10000 DAG.getConstant(Offset, DL, PtrVT));
10001
10002 GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT);
10003 GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop,
10004 DAG.getConstant(GPRSize, DL, PtrVT));
10005 GRTop = DAG.getZExtOrTrunc(GRTop, DL, PtrMemVT);
10006
10007 MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
10009 Align(PtrSize)));
10010 }
10011
10012 // void *__vr_top at offset 16 (8 on ILP32)
10013 Offset += PtrSize;
10014 int FPRSize = FuncInfo->getVarArgsFPRSize();
10015 if (FPRSize > 0) {
10016 SDValue VRTop, VRTopAddr;
10017 VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10018 DAG.getConstant(Offset, DL, PtrVT));
10019
10020 VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT);
10021 VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop,
10022 DAG.getConstant(FPRSize, DL, PtrVT));
10023 VRTop = DAG.getZExtOrTrunc(VRTop, DL, PtrMemVT);
10024
10025 MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
10027 Align(PtrSize)));
10028 }
10029
10030 // int __gr_offs at offset 24 (12 on ILP32)
10031 Offset += PtrSize;
10032 SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10033 DAG.getConstant(Offset, DL, PtrVT));
10034 MemOps.push_back(
10035 DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32),
10036 GROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
10037
10038 // int __vr_offs at offset 28 (16 on ILP32)
10039 Offset += 4;
10040 SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10041 DAG.getConstant(Offset, DL, PtrVT));
10042 MemOps.push_back(
10043 DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32),
10044 VROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
10045
10046 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
10047}
10048
10049SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
10050 SelectionDAG &DAG) const {
10052
10053 if (Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv()))
10054 return LowerWin64_VASTART(Op, DAG);
10055 else if (Subtarget->isTargetDarwin())
10056 return LowerDarwin_VASTART(Op, DAG);
10057 else
10058 return LowerAAPCS_VASTART(Op, DAG);
10059}
10060
10061SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
10062 SelectionDAG &DAG) const {
10063 // AAPCS has three pointers and two ints (= 32 bytes), Darwin has single
10064 // pointer.
10065 SDLoc DL(Op);
10066 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
10067 unsigned VaListSize =
10068 (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
10069 ? PtrSize
10070 : Subtarget->isTargetILP32() ? 20 : 32;
10071 const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
10072 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
10073
10074 return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2),
10075 DAG.getConstant(VaListSize, DL, MVT::i32),
10076 Align(PtrSize), false, false, false,
10077 MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV));
10078}
10079
10080SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
10081 assert(Subtarget->isTargetDarwin() &&
10082 "automatic va_arg instruction only works on Darwin");
10083
10084 const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
10085 EVT VT = Op.getValueType();
10086 SDLoc DL(Op);
10087 SDValue Chain = Op.getOperand(0);
10088 SDValue Addr = Op.getOperand(1);
10089 MaybeAlign Align(Op.getConstantOperandVal(3));
10090 unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8;
10091 auto PtrVT = getPointerTy(DAG.getDataLayout());
10092 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
10093 SDValue VAList =
10094 DAG.getLoad(PtrMemVT, DL, Chain, Addr, MachinePointerInfo(V));
10095 Chain = VAList.getValue(1);
10096 VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT);
10097
10098 if (VT.isScalableVector())
10099 report_fatal_error("Passing SVE types to variadic functions is "
10100 "currently not supported");
10101
10102 if (Align && *Align > MinSlotSize) {
10103 VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10104 DAG.getConstant(Align->value() - 1, DL, PtrVT));
10105 VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList,
10106 DAG.getConstant(-(int64_t)Align->value(), DL, PtrVT));
10107 }
10108
10109 Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
10110 unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
10111
10112 // Scalar integer and FP values smaller than 64 bits are implicitly extended
10113 // up to 64 bits. At the very least, we have to increase the striding of the
10114 // vaargs list to match this, and for FP values we need to introduce
10115 // FP_ROUND nodes as well.
10116 if (VT.isInteger() && !VT.isVector())
10117 ArgSize = std::max(ArgSize, MinSlotSize);
10118 bool NeedFPTrunc = false;
10119 if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
10120 ArgSize = 8;
10121 NeedFPTrunc = true;
10122 }
10123
10124 // Increment the pointer, VAList, to the next vaarg
10125 SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10126 DAG.getConstant(ArgSize, DL, PtrVT));
10127 VANext = DAG.getZExtOrTrunc(VANext, DL, PtrMemVT);
10128
10129 // Store the incremented VAList to the legalized pointer
10130 SDValue APStore =
10131 DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));
10132
10133 // Load the actual argument out of the pointer VAList
10134 if (NeedFPTrunc) {
10135 // Load the value as an f64.
10136 SDValue WideFP =
10137 DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo());
10138 // Round the value down to an f32.
10139 SDValue NarrowFP =
10140 DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
10141 DAG.getIntPtrConstant(1, DL, /*isTarget=*/true));
10142 SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
10143 // Merge the rounded value with the chain output of the load.
10144 return DAG.getMergeValues(Ops, DL);
10145 }
10146
10147 return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo());
10148}
10149
10150SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
10151 SelectionDAG &DAG) const {
10153 MFI.setFrameAddressIsTaken(true);
10154
10155 EVT VT = Op.getValueType();
10156 SDLoc DL(Op);
10157 unsigned Depth = Op.getConstantOperandVal(0);
10158 SDValue FrameAddr =
10159 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64);
10160 while (Depth--)
10161 FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
10163
10164 if (Subtarget->isTargetILP32())
10165 FrameAddr = DAG.getNode(ISD::AssertZext, DL, MVT::i64, FrameAddr,
10166 DAG.getValueType(VT));
10167
10168 return FrameAddr;
10169}
10170
10171SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
10172 SelectionDAG &DAG) const {
10174
10175 EVT VT = getPointerTy(DAG.getDataLayout());
10176 SDLoc DL(Op);
10177 int FI = MFI.CreateFixedObject(4, 0, false);
10178 return DAG.getFrameIndex(FI, VT);
10179}
10180
10181#define GET_REGISTER_MATCHER
10182#include "AArch64GenAsmMatcher.inc"
10183
10184// FIXME? Maybe this could be a TableGen attribute on some registers and
10185// this table could be generated automatically from RegInfo.
10186Register AArch64TargetLowering::
10187getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const {
10189 if (AArch64::X1 <= Reg && Reg <= AArch64::X28) {
10190 const AArch64RegisterInfo *MRI = Subtarget->getRegisterInfo();
10191 unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false);
10192 if (!Subtarget->isXRegisterReserved(DwarfRegNum) &&
10193 !MRI->isReservedReg(MF, Reg))
10194 Reg = 0;
10195 }
10196 if (Reg)
10197 return Reg;
10198 report_fatal_error(Twine("Invalid register name \""
10199 + StringRef(RegName) + "\"."));
10200}
10201
10202SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
10203 SelectionDAG &DAG) const {
10205
10206 EVT VT = Op.getValueType();
10207 SDLoc DL(Op);
10208
10209 SDValue FrameAddr =
10210 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
10212
10213 return DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset);
10214}
10215
10216SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
10217 SelectionDAG &DAG) const {
10219 MachineFrameInfo &MFI = MF.getFrameInfo();
10220 MFI.setReturnAddressIsTaken(true);
10221
10222 EVT VT = Op.getValueType();
10223 SDLoc DL(Op);
10224 unsigned Depth = Op.getConstantOperandVal(0);
10225 SDValue ReturnAddress;
10226 if (Depth) {
10227 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
10229 ReturnAddress = DAG.getLoad(
10230 VT, DL, DAG.getEntryNode(),
10231 DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), MachinePointerInfo());
10232 } else {
10233 // Return LR, which contains the return address. Mark it an implicit
10234 // live-in.
10235 Register Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
10236 ReturnAddress = DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
10237 }
10238
10239 // The XPACLRI instruction assembles to a hint-space instruction before
10240 // Armv8.3-A therefore this instruction can be safely used for any pre
10241 // Armv8.3-A architectures. On Armv8.3-A and onwards XPACI is available so use
10242 // that instead.
10243 SDNode *St;
10244 if (Subtarget->hasPAuth()) {
10245 St = DAG.getMachineNode(AArch64::XPACI, DL, VT, ReturnAddress);
10246 } else {
10247 // XPACLRI operates on LR therefore we must move the operand accordingly.
10248 SDValue Chain =
10249 DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::LR, ReturnAddress);
10250 St = DAG.getMachineNode(AArch64::XPACLRI, DL, VT, Chain);
10251 }
10252 return SDValue(St, 0);
10253}
10254
10255/// LowerShiftParts - Lower SHL_PARTS/SRA_PARTS/SRL_PARTS, which returns two
10256/// i32 values and take a 2 x i32 value to shift plus a shift amount.
10257SDValue AArch64TargetLowering::LowerShiftParts(SDValue Op,
10258 SelectionDAG &DAG) const {
10259 SDValue Lo, Hi;
10260 expandShiftParts(Op.getNode(), Lo, Hi, DAG);
10261 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
10262}
10263
10265 const GlobalAddressSDNode *GA) const {
10266 // Offsets are folded in the DAG combine rather than here so that we can
10267 // intelligently choose an offset based on the uses.
10268 return false;
10269}
10270
10272 bool OptForSize) const {
10273 bool IsLegal = false;
10274 // We can materialize #0.0 as fmov $Rd, XZR for 64-bit, 32-bit cases, and
10275 // 16-bit case when target has full fp16 support.
10276 // FIXME: We should be able to handle f128 as well with a clever lowering.
10277 const APInt ImmInt = Imm.bitcastToAPInt();
10278 if (VT == MVT::f64)
10279 IsLegal = AArch64_AM::getFP64Imm(ImmInt) != -1 || Imm.isPosZero();
10280 else if (VT == MVT::f32)
10281 IsLegal = AArch64_AM::getFP32Imm(ImmInt) != -1 || Imm.isPosZero();
10282 else if (VT == MVT::f16 || VT == MVT::bf16)
10283 IsLegal =
10284 (Subtarget->hasFullFP16() && AArch64_AM::getFP16Imm(ImmInt) != -1) ||
10285 Imm.isPosZero();
10286
10287 // If we can not materialize in immediate field for fmov, check if the
10288 // value can be encoded as the immediate operand of a logical instruction.
10289 // The immediate value will be created with either MOVZ, MOVN, or ORR.
10290 // TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to
10291 // generate that fmov.
10292 if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) {
10293 // The cost is actually exactly the same for mov+fmov vs. adrp+ldr;
10294 // however the mov+fmov sequence is always better because of the reduced
10295 // cache pressure. The timings are still the same if you consider
10296 // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the
10297 // movw+movk is fused). So we limit up to 2 instrdduction at most.
10300 unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 5 : 2));
10301 IsLegal = Insn.size() <= Limit;
10302 }
10303
10304 LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT
10305 << " imm value: "; Imm.dump(););
10306 return IsLegal;
10307}
10308
10309//===----------------------------------------------------------------------===//
10310// AArch64 Optimization Hooks
10311//===----------------------------------------------------------------------===//
10312
10313static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
10314 SDValue Operand, SelectionDAG &DAG,
10315 int &ExtraSteps) {
10316 EVT VT = Operand.getValueType();
10317 if ((ST->hasNEON() &&
10318 (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
10319 VT == MVT::f32 || VT == MVT::v1f32 || VT == MVT::v2f32 ||
10320 VT == MVT::v4f32)) ||
10321 (ST->hasSVE() &&
10322 (VT == MVT::nxv8f16 || VT == MVT::nxv4f32 || VT == MVT::nxv2f64))) {
10324 // For the reciprocal estimates, convergence is quadratic, so the number
10325 // of digits is doubled after each iteration. In ARMv8, the accuracy of
10326 // the initial estimate is 2^-8. Thus the number of extra steps to refine
10327 // the result for float (23 mantissa bits) is 2 and for double (52
10328 // mantissa bits) is 3.
10329 ExtraSteps = VT.getScalarType() == MVT::f64 ? 3 : 2;
10330
10331 return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
10332 }
10333
10334 return SDValue();
10335}
10336
10337SDValue
10338AArch64TargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG,
10339 const DenormalMode &Mode) const {
10340 SDLoc DL(Op);
10341 EVT VT = Op.getValueType();
10342 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
10343 SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
10344 return DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
10345}
10346
10347SDValue
10348AArch64TargetLowering::getSqrtResultForDenormInput(SDValue Op,
10349 SelectionDAG &DAG) const {
10350 return Op;
10351}
10352
10353SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
10354 SelectionDAG &DAG, int Enabled,
10355 int &ExtraSteps,
10356 bool &UseOneConst,
10357 bool Reciprocal) const {
10359 (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt()))
10360 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand,
10361 DAG, ExtraSteps)) {
10362 SDLoc DL(Operand);
10363 EVT VT = Operand.getValueType();
10364
10366 Flags.setAllowReassociation(true);
10367
10368 // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
10369 // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
10370 for (int i = ExtraSteps; i > 0; --i) {
10371 SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
10372 Flags);
10373 Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
10374 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
10375 }
10376 if (!Reciprocal)
10377 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags);
10378
10379 ExtraSteps = 0;
10380 return Estimate;
10381 }
10382
10383 return SDValue();
10384}
10385
10386SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
10387 SelectionDAG &DAG, int Enabled,
10388 int &ExtraSteps) const {
10390 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand,
10391 DAG, ExtraSteps)) {
10392 SDLoc DL(Operand);
10393 EVT VT = Operand.getValueType();
10394
10396 Flags.setAllowReassociation(true);
10397
10398 // Newton reciprocal iteration: E * (2 - X * E)
10399 // AArch64 reciprocal iteration instruction: (2 - M * N)
10400 for (int i = ExtraSteps; i > 0; --i) {
10401 SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand,
10402 Estimate, Flags);
10403 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
10404 }
10405
10406 ExtraSteps = 0;
10407 return Estimate;
10408 }
10409
10410 return SDValue();
10411}
10412
10413//===----------------------------------------------------------------------===//
10414// AArch64 Inline Assembly Support
10415//===----------------------------------------------------------------------===//
10416
10417// Table of Constraints
10418// TODO: This is the current set of constraints supported by ARM for the
10419// compiler, not all of them may make sense.
10420//
10421// r - A general register
10422// w - An FP/SIMD register of some size in the range v0-v31
10423// x - An FP/SIMD register of some size in the range v0-v15
10424// I - Constant that can be used with an ADD instruction
10425// J - Constant that can be used with a SUB instruction
10426// K - Constant that can be used with a 32-bit logical instruction
10427// L - Constant that can be used with a 64-bit logical instruction
10428// M - Constant that can be used as a 32-bit MOV immediate
10429// N - Constant that can be used as a 64-bit MOV immediate
10430// Q - A memory reference with base register and no offset
10431// S - A symbolic address
10432// Y - Floating point constant zero
10433// Z - Integer constant zero
10434//
10435// Note that general register operands will be output using their 64-bit x
10436// register name, whatever the size of the variable, unless the asm operand
10437// is prefixed by the %w modifier. Floating-point and SIMD register operands
10438// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
10439// %q modifier.
10440const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
10441 // At this point, we have to lower this constraint to something else, so we
10442 // lower it to an "r" or "w". However, by doing this we will force the result
10443 // to be in register, while the X constraint is much more permissive.
10444 //
10445 // Although we are correct (we are free to emit anything, without
10446 // constraints), we might break use cases that would expect us to be more
10447 // efficient and emit something else.
10448 if (!Subtarget->hasFPARMv8())
10449 return "r";
10450
10451 if (ConstraintVT.isFloatingPoint())
10452 return "w";
10453
10454 if (ConstraintVT.isVector() &&
10455 (ConstraintVT.getSizeInBits() == 64 ||
10456 ConstraintVT.getSizeInBits() == 128))
10457 return "w";
10458
10459 return "r";
10460}
10461
10463
10464static std::optional<PredicateConstraint>
10467 .Case("Uph", PredicateConstraint::Uph)
10468 .Case("Upl", PredicateConstraint::Upl)
10469 .Case("Upa", PredicateConstraint::Upa)
10470 .Default(std::nullopt);
10471}
10472
10473static const TargetRegisterClass *
10475 if (VT != MVT::aarch64svcount &&
10476 (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1))
10477 return nullptr;
10478
10479 switch (Constraint) {
10480 case PredicateConstraint::Uph:
10481 return VT == MVT::aarch64svcount ? &AArch64::PNR_p8to15RegClass
10482 : &AArch64::PPR_p8to15RegClass;
10483 case PredicateConstraint::Upl:
10484 return VT == MVT::aarch64svcount ? &AArch64::PNR_3bRegClass
10485 : &AArch64::PPR_3bRegClass;
10486 case PredicateConstraint::Upa:
10487 return VT == MVT::aarch64svcount ? &AArch64::PNRRegClass
10488 : &AArch64::PPRRegClass;
10489 }
10490
10491 llvm_unreachable("Missing PredicateConstraint!");
10492}
10493
10495
10496static std::optional<ReducedGprConstraint>
10499 .Case("Uci", ReducedGprConstraint::Uci)
10500 .Case("Ucj", ReducedGprConstraint::Ucj)
10501 .Default(std::nullopt);
10502}
10503
10504static const TargetRegisterClass *
10506 if (!VT.isScalarInteger() || VT.getFixedSizeInBits() > 64)
10507 return nullptr;
10508
10509 switch (Constraint) {
10510 case ReducedGprConstraint::Uci:
10511 return &AArch64::MatrixIndexGPR32_8_11RegClass;
10512 case ReducedGprConstraint::Ucj:
10513 return &AArch64::MatrixIndexGPR32_12_15RegClass;
10514 }
10515
10516 llvm_unreachable("Missing ReducedGprConstraint!");
10517}
10518
10519// The set of cc code supported is from
10520// https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#Flag-Output-Operands
10523 .Case("{@cchi}", AArch64CC::HI)
10524 .Case("{@cccs}", AArch64CC::HS)
10525 .Case("{@cclo}", AArch64CC::LO)
10526 .Case("{@ccls}", AArch64CC::LS)
10527 .Case("{@cccc}", AArch64CC::LO)
10528 .Case("{@cceq}", AArch64CC::EQ)
10529 .Case("{@ccgt}", AArch64CC::GT)
10530 .Case("{@ccge}", AArch64CC::GE)
10531 .Case("{@cclt}", AArch64CC::LT)
10532 .Case("{@ccle}", AArch64CC::LE)
10533 .Case("{@cchs}", AArch64CC::HS)
10534 .Case("{@ccne}", AArch64CC::NE)
10535 .Case("{@ccvc}", AArch64CC::VC)
10536 .Case("{@ccpl}", AArch64CC::PL)
10537 .Case("{@ccvs}", AArch64CC::VS)
10538 .Case("{@ccmi}", AArch64CC::MI)
10540 return Cond;
10541}
10542
10543/// Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR,
10544/// WZR, invert(<cond>)'.
10546 SelectionDAG &DAG) {
10547 return DAG.getNode(
10548 AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32),
10549 DAG.getConstant(0, DL, MVT::i32),
10550 DAG.getConstant(getInvertedCondCode(CC), DL, MVT::i32), NZCV);
10551}
10552
10553// Lower @cc flag output via getSETCC.
10554SDValue AArch64TargetLowering::LowerAsmOutputForConstraint(
10555 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
10556 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
10557 AArch64CC::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
10558 if (Cond == AArch64CC::Invalid)
10559 return SDValue();
10560 // The output variable should be a scalar integer.
10561 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
10562 OpInfo.ConstraintVT.getSizeInBits() < 8)
10563 report_fatal_error("Flag output operand is of invalid type");
10564
10565 // Get NZCV register. Only update chain when copyfrom is glued.
10566 if (Glue.getNode()) {
10567 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, MVT::i32, Glue);
10568 Chain = Glue.getValue(1);
10569 } else
10570 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, MVT::i32);
10571 // Extract CC code.
10572 SDValue CC = getSETCC(Cond, Glue, DL, DAG);
10573
10575
10576 // Truncate or ZERO_EXTEND based on value types.
10577 if (OpInfo.ConstraintVT.getSizeInBits() <= 32)
10578 Result = DAG.getNode(ISD::TRUNCATE, DL, OpInfo.ConstraintVT, CC);
10579 else
10580 Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
10581
10582 return Result;
10583}
10584
10585/// getConstraintType - Given a constraint letter, return the type of
10586/// constraint it is for this target.
10588AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
10589 if (Constraint.size() == 1) {
10590 switch (Constraint[0]) {
10591 default:
10592 break;
10593 case 'x':
10594 case 'w':
10595 case 'y':
10596 return C_RegisterClass;
10597 // An address with a single base register. Due to the way we
10598 // currently handle addresses it is the same as 'r'.
10599 case 'Q':
10600 return C_Memory;
10601 case 'I':
10602 case 'J':
10603 case 'K':
10604 case 'L':
10605 case 'M':
10606 case 'N':
10607 case 'Y':
10608 case 'Z':
10609 return C_Immediate;
10610 case 'z':
10611 case 'S': // A symbolic address
10612 return C_Other;
10613 }
10614 } else if (parsePredicateConstraint(Constraint))
10615 return C_RegisterClass;
10616 else if (parseReducedGprConstraint(Constraint))
10617 return C_RegisterClass;
10618 else if (parseConstraintCode(Constraint) != AArch64CC::Invalid)
10619 return C_Other;
10620 return TargetLowering::getConstraintType(Constraint);
10621}
10622
10623/// Examine constraint type and operand type and determine a weight value.
10624/// This object must already have been set up with the operand type
10625/// and the current alternative constraint selected.
10627AArch64TargetLowering::getSingleConstraintMatchWeight(
10628 AsmOperandInfo &info, const char *constraint) const {
10630 Value *CallOperandVal = info.CallOperandVal;
10631 // If we don't have a value, we can't do a match,
10632 // but allow it at the lowest weight.
10633 if (!CallOperandVal)
10634 return CW_Default;
10635 Type *type = CallOperandVal->getType();
10636 // Look at the constraint type.
10637 switch (*constraint) {
10638 default:
10640 break;
10641 case 'x':
10642 case 'w':
10643 case 'y':
10644 if (type->isFloatingPointTy() || type->isVectorTy())
10645 weight = CW_Register;
10646 break;
10647 case 'z':
10648 weight = CW_Constant;
10649 break;
10650 case 'U':
10651 if (parsePredicateConstraint(constraint) ||
10652 parseReducedGprConstraint(constraint))
10653 weight = CW_Register;
10654 break;
10655 }
10656 return weight;
10657}
10658
10659std::pair<unsigned, const TargetRegisterClass *>
10660AArch64TargetLowering::getRegForInlineAsmConstraint(
10661 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
10662 if (Constraint.size() == 1) {
10663 switch (Constraint[0]) {
10664 case 'r':
10665 if (VT.isScalableVector())
10666 return std::make_pair(0U, nullptr);
10667 if (Subtarget->hasLS64() && VT.getSizeInBits() == 512)
10668 return std::make_pair(0U, &AArch64::GPR64x8ClassRegClass);
10669 if (VT.getFixedSizeInBits() == 64)
10670 return std::make_pair(0U, &AArch64::GPR64commonRegClass);
10671 return std::make_pair(0U, &AArch64::GPR32commonRegClass);
10672 case 'w': {
10673 if (!Subtarget->hasFPARMv8())
10674 break;
10675 if (VT.isScalableVector()) {
10676 if (VT.getVectorElementType() != MVT::i1)
10677 return std::make_pair(0U, &AArch64::ZPRRegClass);
10678 return std::make_pair(0U, nullptr);
10679 }
10680 uint64_t VTSize = VT.getFixedSizeInBits();
10681 if (VTSize == 16)
10682 return std::make_pair(0U, &AArch64::FPR16RegClass);
10683 if (VTSize == 32)
10684 return std::make_pair(0U, &AArch64::FPR32RegClass);
10685 if (VTSize == 64)
10686 return std::make_pair(0U, &AArch64::FPR64RegClass);
10687 if (VTSize == 128)
10688 return std::make_pair(0U, &AArch64::FPR128RegClass);
10689 break;
10690 }
10691 // The instructions that this constraint is designed for can
10692 // only take 128-bit registers so just use that regclass.
10693 case 'x':
10694 if (!Subtarget->hasFPARMv8())
10695 break;
10696 if (VT.isScalableVector())
10697 return std::make_pair(0U, &AArch64::ZPR_4bRegClass);
10698 if (VT.getSizeInBits() == 128)
10699 return std::make_pair(0U, &AArch64::FPR128_loRegClass);
10700 break;
10701 case 'y':
10702 if (!Subtarget->hasFPARMv8())
10703 break;
10704 if (VT.isScalableVector())
10705 return std::make_pair(0U, &AArch64::ZPR_3bRegClass);
10706 break;
10707 }
10708 } else {
10709 if (const auto PC = parsePredicateConstraint(Constraint))
10710 if (const auto *RegClass = getPredicateRegisterClass(*PC, VT))
10711 return std::make_pair(0U, RegClass);
10712
10713 if (const auto RGC = parseReducedGprConstraint(Constraint))
10714 if (const auto *RegClass = getReducedGprRegisterClass(*RGC, VT))
10715 return std::make_pair(0U, RegClass);
10716 }
10717 if (StringRef("{cc}").equals_insensitive(Constraint) ||
10719 return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);
10720
10721 if (Constraint == "{za}") {
10722 return std::make_pair(unsigned(AArch64::ZA), &AArch64::MPRRegClass);
10723 }
10724
10725 if (Constraint == "{zt0}") {
10726 return std::make_pair(unsigned(AArch64::ZT0), &AArch64::ZTRRegClass);
10727 }
10728
10729 // Use the default implementation in TargetLowering to convert the register
10730 // constraint into a member of a register class.
10731 std::pair<unsigned, const TargetRegisterClass *> Res;
10733
10734 // Not found as a standard register?
10735 if (!Res.second) {
10736 unsigned Size = Constraint.size();
10737 if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
10738 tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
10739 int RegNo;
10740 bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
10741 if (!Failed && RegNo >= 0 && RegNo <= 31) {
10742 // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
10743 // By default we'll emit v0-v31 for this unless there's a modifier where
10744 // we'll emit the correct register as well.
10745 if (VT != MVT::Other && VT.getSizeInBits() == 64) {
10746 Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
10747 Res.second = &AArch64::FPR64RegClass;
10748 } else {
10749 Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
10750 Res.second = &AArch64::FPR128RegClass;
10751 }
10752 }
10753 }
10754 }
10755
10756 if (Res.second && !Subtarget->hasFPARMv8() &&
10757 !AArch64::GPR32allRegClass.hasSubClassEq(Res.second) &&
10758 !AArch64::GPR64allRegClass.hasSubClassEq(Res.second))
10759 return std::make_pair(0U, nullptr);
10760
10761 return Res;
10762}
10763
10765 llvm::Type *Ty,
10766 bool AllowUnknown) const {
10767 if (Subtarget->hasLS64() && Ty->isIntegerTy(512))
10768 return EVT(MVT::i64x8);
10769
10770 return TargetLowering::getAsmOperandValueType(DL, Ty, AllowUnknown);
10771}
10772
/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector. If it is invalid, don't add anything to Ops.
///
/// Handles the AArch64-specific single-letter constraints:
///   'z'     - the zero register (XZR/WZR); requires a constant-0 input.
///   'S'     - an absolute symbolic address or label reference.
///   'I'-'N' - classes of immediate operands (documented per case below).
/// All other constraints fall through to the generic implementation.
void AArch64TargetLowering::LowerAsmOperandForConstraint(
    SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
    SelectionDAG &DAG) const {
  SDValue Result;

  // Currently only support length 1 constraints.
  if (Constraint.size() != 1)
    return;

  char ConstraintLetter = Constraint[0];
  switch (ConstraintLetter) {
  default:
    break;

  // This set of constraints deal with valid constants for various instructions.
  // Validate and return a target constant for them if we can.
  case 'z': {
    // 'z' maps to xzr or wzr so it needs an input of 0.
    if (!isNullConstant(Op))
      return;

    if (Op.getValueType() == MVT::i64)
      Result = DAG.getRegister(AArch64::XZR, MVT::i64);
    else
      Result = DAG.getRegister(AArch64::WZR, MVT::i32);
    break;
  }
  case 'S': {
    // An absolute symbolic address or label reference.
    if (const GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op)) {
      Result = DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
                                          GA->getValueType(0));
    } else if (const BlockAddressSDNode *BA =
                   dyn_cast<BlockAddressSDNode>(Op)) {
      Result =
          DAG.getTargetBlockAddress(BA->getBlockAddress(), BA->getValueType(0));
    } else
      return;
    break;
  }

  case 'I':
  case 'J':
  case 'K':
  case 'L':
  case 'M':
  case 'N':
    // All of these constraints require a constant operand.
    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
    if (!C)
      return;

    // Grab the value and do some validation.
    uint64_t CVal = C->getZExtValue();
    switch (ConstraintLetter) {
    // The I constraint applies only to simple ADD or SUB immediate operands:
    // i.e. 0 to 4095 with optional shift by 12
    // The J constraint applies only to ADD or SUB immediates that would be
    // valid when negated, i.e. if [an add pattern] were to be output as a SUB
    // instruction [or vice versa], in other words -1 to -4095 with optional
    // left shift by 12.
    case 'I':
      if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
        break;
      return;
    case 'J': {
      uint64_t NVal = -C->getSExtValue();
      if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
        // Emit the original (negative) value; the negation above was only
        // for range validation.
        CVal = C->getSExtValue();
        break;
      }
      return;
    }
    // The K and L constraints apply *only* to logical immediates, including
    // what used to be the MOVI alias for ORR (though the MOVI alias has now
    // been removed and MOV should be used). So these constraints have to
    // distinguish between bit patterns that are valid 32-bit or 64-bit
    // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
    // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
    // versa.
    case 'K':
      if (AArch64_AM::isLogicalImmediate(CVal, 32))
        break;
      return;
    case 'L':
      if (AArch64_AM::isLogicalImmediate(CVal, 64))
        break;
      return;
    // The M and N constraints are a superset of K and L respectively, for use
    // with the MOV (immediate) alias. As well as the logical immediates they
    // also match 32 or 64-bit immediates that can be loaded either using a
    // *single* MOVZ or MOVN , such as 32-bit 0x12340000, 0x00001234, 0xffffedca
    // (M) or 64-bit 0x1234000000000000 (N) etc.
    // As a note some of this code is liberally stolen from the asm parser.
    case 'M': {
      if (!isUInt<32>(CVal))
        return;
      if (AArch64_AM::isLogicalImmediate(CVal, 32))
        break;
      // A single 16-bit chunk in either halfword position (MOVZ)...
      if ((CVal & 0xFFFF) == CVal)
        break;
      if ((CVal & 0xFFFF0000ULL) == CVal)
        break;
      // ...or the 32-bit bitwise-NOT of such a chunk (MOVN).
      uint64_t NCVal = ~(uint32_t)CVal;
      if ((NCVal & 0xFFFFULL) == NCVal)
        break;
      if ((NCVal & 0xFFFF0000ULL) == NCVal)
        break;
      return;
    }
    case 'N': {
      if (AArch64_AM::isLogicalImmediate(CVal, 64))
        break;
      // A single 16-bit chunk in any of the four halfword positions (MOVZ)...
      if ((CVal & 0xFFFFULL) == CVal)
        break;
      if ((CVal & 0xFFFF0000ULL) == CVal)
        break;
      if ((CVal & 0xFFFF00000000ULL) == CVal)
        break;
      if ((CVal & 0xFFFF000000000000ULL) == CVal)
        break;
      // ...or the 64-bit bitwise-NOT of such a chunk (MOVN).
      uint64_t NCVal = ~CVal;
      if ((NCVal & 0xFFFFULL) == NCVal)
        break;
      if ((NCVal & 0xFFFF0000ULL) == NCVal)
        break;
      if ((NCVal & 0xFFFF00000000ULL) == NCVal)
        break;
      if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
        break;
      return;
    }
    default:
      return;
    }

    // All assembler immediates are 64-bit integers.
    Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64);
    break;
  }

  if (Result.getNode()) {
    Ops.push_back(Result);
    return;
  }

  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
10922
10923//===----------------------------------------------------------------------===//
10924// AArch64 Advanced SIMD Support
10925//===----------------------------------------------------------------------===//
10926
10927/// WidenVector - Given a value in the V64 register class, produce the
10928/// equivalent value in the V128 register class.
10930 EVT VT = V64Reg.getValueType();
10931 unsigned NarrowSize = VT.getVectorNumElements();
10932 MVT EltTy = VT.getVectorElementType().getSimpleVT();
10933 MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
10934 SDLoc DL(V64Reg);
10935
10936 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy),
10937 V64Reg, DAG.getConstant(0, DL, MVT::i64));
10938}
10939
10940/// getExtFactor - Determine the adjustment factor for the position when
10941/// generating an "extract from vector registers" instruction.
10942static unsigned getExtFactor(SDValue &V) {
10943 EVT EltType = V.getValueType().getVectorElementType();
10944 return EltType.getSizeInBits() / 8;
10945}
10946
// Check if a vector is built from one vector via extracted elements of
// another together with an AND mask, ensuring that all elements fit
// within range. This can be reconstructed using AND and NEON's TBL1.
// Returns the replacement node, or an empty SDValue if the pattern does not
// match.
static SDValue ReconstructShuffleWithRuntimeMask(SDValue Op, SelectionDAG &DAG) {
  assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  assert(!VT.isScalableVector() &&
         "Scalable vectors cannot be used with ISD::BUILD_VECTOR");

  // Can only recreate a shuffle with 16xi8 or 8xi8 elements, as they map
  // directly to TBL1.
  if (VT != MVT::v16i8 && VT != MVT::v8i8)
    return SDValue();

  unsigned NumElts = VT.getVectorNumElements();
  assert((NumElts == 8 || NumElts == 16) &&
         "Need to have exactly 8 or 16 elements in vector.");

  // The single vector all data elements are extracted from.
  SDValue SourceVec;
  // The single vector the runtime indices are extracted from.
  SDValue MaskSourceVec;
  // Per-element AND mask constants, if the pattern uses an AND.
  SmallVector<SDValue, 16> AndMaskConstants;

  for (unsigned i = 0; i < NumElts; ++i) {
    SDValue V = Op.getOperand(i);
    if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();

    // All extracts must read from one common source vector.
    SDValue OperandSourceVec = V.getOperand(0);
    if (!SourceVec)
      SourceVec = OperandSourceVec;
    else if (SourceVec != OperandSourceVec)
      return SDValue();

    // This only looks at shuffles with elements that are
    // a) truncated by a constant AND mask extracted from a mask vector, or
    // b) extracted directly from a mask vector.
    SDValue MaskSource = V.getOperand(1);
    if (MaskSource.getOpcode() == ISD::AND) {
      if (!isa<ConstantSDNode>(MaskSource.getOperand(1)))
        return SDValue();

      AndMaskConstants.push_back(MaskSource.getOperand(1));
      MaskSource = MaskSource->getOperand(0);
    } else if (!AndMaskConstants.empty()) {
      // Either all or no operands should have an AND mask.
      return SDValue();
    }

    // An ANY_EXTEND may be inserted between the AND and the source vector
    // extraction. We don't care about that, so we can just skip it.
    if (MaskSource.getOpcode() == ISD::ANY_EXTEND)
      MaskSource = MaskSource.getOperand(0);

    if (MaskSource.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();

    // The runtime index must come from lane i of the mask vector, mirroring
    // this element's position in the result.
    SDValue MaskIdx = MaskSource.getOperand(1);
    if (!isa<ConstantSDNode>(MaskIdx) ||
        !cast<ConstantSDNode>(MaskIdx)->getConstantIntValue()->equalsInt(i))
      return SDValue();

    // We only apply this if all elements come from the same vector with the
    // same vector type.
    if (!MaskSourceVec) {
      MaskSourceVec = MaskSource->getOperand(0);
      if (MaskSourceVec.getValueType() != VT)
        return SDValue();
    } else if (MaskSourceVec != MaskSource->getOperand(0)) {
      return SDValue();
    }
  }

  // We need a v16i8 for TBL, so we extend the source with a placeholder vector
  // for v8i8 to get a v16i8. As the pattern we are replacing is extract +
  // insert, we know that the index in the mask must be smaller than the number
  // of elements in the source, or we would have an out-of-bounds access.
  if (NumElts == 8)
    SourceVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, SourceVec,
                            DAG.getUNDEF(VT));

  // Preconditions met, so we can use a vector (AND +) TBL to build this vector.
  if (!AndMaskConstants.empty())
    MaskSourceVec = DAG.getNode(ISD::AND, dl, VT, MaskSourceVec,
                                DAG.getBuildVector(VT, dl, AndMaskConstants));

  return DAG.getNode(
      ISD::INTRINSIC_WO_CHAIN, dl, VT,
      DAG.getConstant(Intrinsic::aarch64_neon_tbl1, dl, MVT::i32), SourceVec,
      MaskSourceVec);
}
11038
// Gather data to see if the operation can be modelled as a
// shuffle in combination with VEXTs.
// Examines a BUILD_VECTOR whose operands are all extracts from other
// fixed-width vectors and tries to rebuild it as either a TBL intrinsic
// (3-4 sources) or a VECTOR_SHUFFLE over at most two (possibly EXTed and
// bitcast) sources. Returns an empty SDValue if no profitable lowering is
// found.
SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
                                                  SelectionDAG &DAG) const {
  assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
  LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  assert(!VT.isScalableVector() &&
         "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
  unsigned NumElts = VT.getVectorNumElements();

  // Bookkeeping for one distinct source vector feeding the BUILD_VECTOR.
  struct ShuffleSourceInfo {
    SDValue Vec;
    unsigned MinElt;
    unsigned MaxElt;

    // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
    // be compatible with the shuffle we intend to construct. As a result
    // ShuffleVec will be some sliding window into the original Vec.
    SDValue ShuffleVec;

    // Code should guarantee that element i in Vec starts at element "WindowBase
    // + i * WindowScale in ShuffleVec".
    int WindowBase;
    int WindowScale;

    ShuffleSourceInfo(SDValue Vec)
        : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
          ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}

    bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
  };

  // First gather all vectors used as an immediate source for this BUILD_VECTOR
  // node.
  SmallVector<ShuffleSourceInfo, 2> Sources;
  for (unsigned i = 0; i < NumElts; ++i) {
    SDValue V = Op.getOperand(i);
    if (V.isUndef())
      continue;
    else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
             !isa<ConstantSDNode>(V.getOperand(1)) ||
             V.getOperand(0).getValueType().isScalableVector()) {
      LLVM_DEBUG(
          dbgs() << "Reshuffle failed: "
                    "a shuffle can only come from building a vector from "
                    "various elements of other fixed-width vectors, provided "
                    "their indices are constant\n");
      return SDValue();
    }

    // Add this element source to the list if it's not already there.
    SDValue SourceVec = V.getOperand(0);
    auto Source = find(Sources, SourceVec);
    if (Source == Sources.end())
      Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));

    // Update the minimum and maximum lane number seen.
    unsigned EltNo = V.getConstantOperandVal(1);
    Source->MinElt = std::min(Source->MinElt, EltNo);
    Source->MaxElt = std::max(Source->MaxElt, EltNo);
  }

  // If we have 3 or 4 sources, try to generate a TBL, which will at least be
  // better than moving to/from gpr registers for larger vectors.
  if ((Sources.size() == 3 || Sources.size() == 4) && NumElts > 4) {
    // Construct a mask for the tbl. We may need to adjust the index for types
    // larger than i8.
    SmallVector<int, 16> Mask;
    // Number of mask bytes written per output element.
    unsigned OutputFactor = VT.getScalarSizeInBits() / 8;
    for (unsigned I = 0; I < NumElts; ++I) {
      SDValue V = Op.getOperand(I);
      if (V.isUndef()) {
        for (unsigned OF = 0; OF < OutputFactor; OF++)
          Mask.push_back(-1);
        continue;
      }
      // Set the Mask lanes adjusted for the size of the input and output
      // lanes. The Mask is always i8, so it will set OutputFactor lanes per
      // output element, adjusted in their positions per input and output types.
      unsigned Lane = V.getConstantOperandVal(1);
      for (unsigned S = 0; S < Sources.size(); S++) {
        if (V.getOperand(0) == Sources[S].Vec) {
          unsigned InputSize = Sources[S].Vec.getScalarValueSizeInBits();
          // Each TBL source register contributes 16 byte indices.
          unsigned InputBase = 16 * S + Lane * InputSize / 8;
          for (unsigned OF = 0; OF < OutputFactor; OF++)
            Mask.push_back(InputBase + OF);
          break;
        }
      }
    }

    // Construct the tbl3/tbl4 out of an intrinsic, the sources converted to
    // v16i8, and the TBLMask
    SmallVector<SDValue, 16> TBLOperands;
    TBLOperands.push_back(DAG.getConstant(Sources.size() == 3
                                              ? Intrinsic::aarch64_neon_tbl3
                                              : Intrinsic::aarch64_neon_tbl4,
                                          dl, MVT::i32));
    for (unsigned i = 0; i < Sources.size(); i++) {
      SDValue Src = Sources[i].Vec;
      EVT SrcVT = Src.getValueType();
      Src = DAG.getBitcast(SrcVT.is64BitVector() ? MVT::v8i8 : MVT::v16i8, Src);
      assert((SrcVT.is64BitVector() || SrcVT.is128BitVector()) &&
             "Expected a legally typed vector");
      // TBL sources must be full 128-bit registers; pad 64-bit sources.
      if (SrcVT.is64BitVector())
        Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, Src,
                          DAG.getUNDEF(MVT::v8i8));
      TBLOperands.push_back(Src);
    }

    SmallVector<SDValue, 16> TBLMask;
    for (unsigned i = 0; i < Mask.size(); i++)
      TBLMask.push_back(DAG.getConstant(Mask[i], dl, MVT::i32));
    assert((Mask.size() == 8 || Mask.size() == 16) &&
           "Expected a v8i8 or v16i8 Mask");
    TBLOperands.push_back(
        DAG.getBuildVector(Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, dl, TBLMask));

    SDValue Shuffle =
        DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
                    Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, TBLOperands);
    return DAG.getBitcast(VT, Shuffle);
  }

  if (Sources.size() > 2) {
    LLVM_DEBUG(dbgs() << "Reshuffle failed: currently only do something "
                      << "sensible when at most two source vectors are "
                      << "involved\n");
    return SDValue();
  }

  // Find out the smallest element size among result and two sources, and use
  // it as element size to build the shuffle_vector.
  EVT SmallestEltTy = VT.getVectorElementType();
  for (auto &Source : Sources) {
    EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
    if (SrcEltTy.bitsLT(SmallestEltTy)) {
      SmallestEltTy = SrcEltTy;
    }
  }
  unsigned ResMultiplier =
      VT.getScalarSizeInBits() / SmallestEltTy.getFixedSizeInBits();
  uint64_t VTSize = VT.getFixedSizeInBits();
  NumElts = VTSize / SmallestEltTy.getFixedSizeInBits();
  EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);

  // If the source vector is too wide or too narrow, we may nevertheless be able
  // to construct a compatible shuffle either by concatenating it with UNDEF or
  // extracting a suitable range of elements.
  for (auto &Src : Sources) {
    EVT SrcVT = Src.ShuffleVec.getValueType();

    TypeSize SrcVTSize = SrcVT.getSizeInBits();
    if (SrcVTSize == TypeSize::getFixed(VTSize))
      continue;

    // This stage of the search produces a source with the same element type as
    // the original, but with a total width matching the BUILD_VECTOR output.
    EVT EltVT = SrcVT.getVectorElementType();
    unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
    EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);

    if (SrcVTSize.getFixedValue() < VTSize) {
      assert(2 * SrcVTSize == VTSize);
      // We can pad out the smaller vector for free, so if it's part of a
      // shuffle...
      Src.ShuffleVec =
          DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
                      DAG.getUNDEF(Src.ShuffleVec.getValueType()));
      continue;
    }

    if (SrcVTSize.getFixedValue() != 2 * VTSize) {
      LLVM_DEBUG(
          dbgs() << "Reshuffle failed: result vector too small to extract\n");
      return SDValue();
    }

    if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
      LLVM_DEBUG(
          dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
      return SDValue();
    }

    if (Src.MinElt >= NumSrcElts) {
      // The extraction can just take the second half
      Src.ShuffleVec =
          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
                      DAG.getConstant(NumSrcElts, dl, MVT::i64));
      Src.WindowBase = -NumSrcElts;
    } else if (Src.MaxElt < NumSrcElts) {
      // The extraction can just take the first half
      Src.ShuffleVec =
          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
                      DAG.getConstant(0, dl, MVT::i64));
    } else {
      // An actual VEXT is needed
      SDValue VEXTSrc1 =
          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
                      DAG.getConstant(0, dl, MVT::i64));
      SDValue VEXTSrc2 =
          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
                      DAG.getConstant(NumSrcElts, dl, MVT::i64));
      unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);

      if (!SrcVT.is64BitVector()) {
        LLVM_DEBUG(
            dbgs() << "Reshuffle failed: don't know how to lower AArch64ISD::EXT "
                      "for SVE vectors.");
        return SDValue();
      }

      Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1,
                                   VEXTSrc2,
                                   DAG.getConstant(Imm, dl, MVT::i32));
      Src.WindowBase = -Src.MinElt;
    }
  }

  // Another possible incompatibility occurs from the vector element types. We
  // can fix this by bitcasting the source vectors to the same type we intend
  // for the shuffle.
  for (auto &Src : Sources) {
    EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
    if (SrcEltTy == SmallestEltTy)
      continue;
    assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
    // On big-endian a plain BITCAST would reorder lanes; NVCAST keeps the
    // register contents unchanged.
    if (DAG.getDataLayout().isBigEndian()) {
      Src.ShuffleVec =
          DAG.getNode(AArch64ISD::NVCAST, dl, ShuffleVT, Src.ShuffleVec);
    } else {
      Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
    }
    Src.WindowScale =
        SrcEltTy.getFixedSizeInBits() / SmallestEltTy.getFixedSizeInBits();
    Src.WindowBase *= Src.WindowScale;
  }

  // Final check before we try to actually produce a shuffle.
  LLVM_DEBUG(for (auto Src
                  : Sources)
                 assert(Src.ShuffleVec.getValueType() == ShuffleVT););

  // The stars all align, our next step is to produce the mask for the shuffle.
  SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
  int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
  for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
    SDValue Entry = Op.getOperand(i);
    if (Entry.isUndef())
      continue;

    auto Src = find(Sources, Entry.getOperand(0));
    int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();

    // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
    // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
    // segment.
    EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
    int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
                               VT.getScalarSizeInBits());
    int LanesDefined = BitsDefined / BitsPerShuffleLane;

    // This source is expected to fill ResMultiplier lanes of the final shuffle,
    // starting at the appropriate offset.
    int *LaneMask = &Mask[i * ResMultiplier];

    int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
    ExtractBase += NumElts * (Src - Sources.begin());
    for (int j = 0; j < LanesDefined; ++j)
      LaneMask[j] = ExtractBase + j;
  }

  // Final check before we try to produce nonsense...
  if (!isShuffleMaskLegal(Mask, ShuffleVT)) {
    LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
    return SDValue();
  }

  SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
  for (unsigned i = 0; i < Sources.size(); ++i)
    ShuffleOps[i] = Sources[i].ShuffleVec;

  SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
                                         ShuffleOps[1], Mask);
  SDValue V;
  // Cast the shuffled value back to the requested result type, again using
  // NVCAST on big-endian targets to avoid lane reordering.
  if (DAG.getDataLayout().isBigEndian()) {
    V = DAG.getNode(AArch64ISD::NVCAST, dl, VT, Shuffle);
  } else {
    V = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
  }

  LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump();
             dbgs() << "Reshuffle, creating node: "; V.dump(););

  return V;
}
11337
11338// check if an EXT instruction can handle the shuffle mask when the
11339// vector sources of the shuffle are the same.
11340static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
11341 unsigned NumElts = VT.getVectorNumElements();
11342
11343 // Assume that the first shuffle index is not UNDEF. Fail if it is.
11344 if (M[0] < 0)
11345 return false;
11346
11347 Imm = M[0];
11348
11349 // If this is a VEXT shuffle, the immediate value is the index of the first
11350 // element. The other shuffle indices must be the successive elements after
11351 // the first one.
11352 unsigned ExpectedElt = Imm;
11353 for (unsigned i = 1; i < NumElts; ++i) {
11354 // Increment the expected index. If it wraps around, just follow it
11355 // back to index zero and keep going.
11356 ++ExpectedElt;
11357 if (ExpectedElt == NumElts)
11358 ExpectedElt = 0;
11359
11360 if (M[i] < 0)
11361 continue; // ignore UNDEF indices
11362 if (ExpectedElt != static_cast<unsigned>(M[i]))
11363 return false;
11364 }
11365
11366 return true;
11367}
11368
// Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
// v4i32s. This is really a truncate, which we can construct out of (legal)
// concats and truncate nodes.
// Returns the replacement node, or an empty SDValue if the BUILD_VECTOR does
// not match the pattern.
static SDValue ReconstructTruncateFromBuildVector(SDValue V, SelectionDAG &DAG) {
  if (V.getValueType() != MVT::v16i8)
    return SDValue();
  assert(V.getNumOperands() == 16 && "Expected 16 operands on the BUILDVECTOR");

  // Validate four groups of four lanes; each group must be lanes 0..3, in
  // order, of one v4i16 or v4i32 source.
  for (unsigned X = 0; X < 4; X++) {
    // Check the first item in each group is an extract from lane 0 of a v4i32
    // or v4i16.
    SDValue BaseExt = V.getOperand(X * 4);
    if (BaseExt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        (BaseExt.getOperand(0).getValueType() != MVT::v4i16 &&
         BaseExt.getOperand(0).getValueType() != MVT::v4i32) ||
        !isa<ConstantSDNode>(BaseExt.getOperand(1)) ||
        BaseExt.getConstantOperandVal(1) != 0)
      return SDValue();
    SDValue Base = BaseExt.getOperand(0);
    // And check the other items are extracts from the same vector.
    for (unsigned Y = 1; Y < 4; Y++) {
      SDValue Ext = V.getOperand(X * 4 + Y);
      if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
          Ext.getOperand(0) != Base ||
          !isa<ConstantSDNode>(Ext.getOperand(1)) ||
          Ext.getConstantOperandVal(1) != Y)
        return SDValue();
    }
  }

  // Turn the buildvector into a series of truncates and concates, which will
  // become uzip1's. Any v4i32s we found get truncated to v4i16, which are
  // concat together to produce 2 v8i16. These are both truncated and concat
  // together.
  SDLoc DL(V);
  SDValue Trunc[4] = {
      V.getOperand(0).getOperand(0), V.getOperand(4).getOperand(0),
      V.getOperand(8).getOperand(0), V.getOperand(12).getOperand(0)};
  for (SDValue &V : Trunc)
    if (V.getValueType() == MVT::v4i32)
      V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i16, V);
  SDValue Concat0 =
      DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[0], Trunc[1]);
  SDValue Concat1 =
      DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[2], Trunc[3]);
  SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat0);
  SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat1);
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Trunc0, Trunc1);
}
11418
11419/// Check if a vector shuffle corresponds to a DUP instructions with a larger
11420/// element width than the vector lane type. If that is the case the function
11421/// returns true and writes the value of the DUP instruction lane operand into
11422/// DupLaneOp
11423static bool isWideDUPMask(ArrayRef<int> M, EVT VT, unsigned BlockSize,
11424 unsigned &DupLaneOp) {
11425 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
11426 "Only possible block sizes for wide DUP are: 16, 32, 64");
11427
11428 if (BlockSize <= VT.getScalarSizeInBits())
11429 return false;
11430 if (BlockSize % VT.getScalarSizeInBits() != 0)
11431 return false;
11432 if (VT.getSizeInBits() % BlockSize != 0)
11433 return false;
11434
11435 size_t SingleVecNumElements = VT.getVectorNumElements();
11436 size_t NumEltsPerBlock = BlockSize / VT.getScalarSizeInBits();
11437 size_t NumBlocks = VT.getSizeInBits() / BlockSize;
11438
11439 // We are looking for masks like
11440 // [0, 1, 0, 1] or [2, 3, 2, 3] or [4, 5, 6, 7, 4, 5, 6, 7] where any element
11441 // might be replaced by 'undefined'. BlockIndices will eventually contain
11442 // lane indices of the duplicated block (i.e. [0, 1], [2, 3] and [4, 5, 6, 7]
11443 // for the above examples)
11444 SmallVector<int, 8> BlockElts(NumEltsPerBlock, -1);
11445 for (size_t BlockIndex = 0; BlockIndex < NumBlocks; BlockIndex++)
11446 for (size_t I = 0; I < NumEltsPerBlock; I++) {
11447 int Elt = M[BlockIndex * NumEltsPerBlock + I];
11448 if (Elt < 0)
11449 continue;
11450 // For now we don't support shuffles that use the second operand
11451 if ((unsigned)Elt >= SingleVecNumElements)
11452 return false;
11453 if (BlockElts[I] < 0)
11454 BlockElts[I] = Elt;
11455 else if (BlockElts[I] != Elt)
11456 return false;
11457 }
11458
11459 // We found a candidate block (possibly with some undefs). It must be a
11460 // sequence of consecutive integers starting with a value divisible by
11461 // NumEltsPerBlock with some values possibly replaced by undef-s.
11462
11463 // Find first non-undef element
11464 auto FirstRealEltIter = find_if(BlockElts, [](int Elt) { return Elt >= 0; });
11465 assert(FirstRealEltIter != BlockElts.end() &&
11466 "Shuffle with all-undefs must have been caught by previous cases, "
11467 "e.g. isSplat()");
11468 if (FirstRealEltIter == BlockElts.end()) {
11469 DupLaneOp = 0;
11470 return true;
11471 }
11472
11473 // Index of FirstRealElt in BlockElts
11474 size_t FirstRealIndex = FirstRealEltIter - BlockElts.begin();
11475
11476 if ((unsigned)*FirstRealEltIter < FirstRealIndex)
11477 return false;
11478 // BlockElts[0] must have the following value if it isn't undef:
11479 size_t Elt0 = *FirstRealEltIter - FirstRealIndex;
11480
11481 // Check the first element
11482 if (Elt0 % NumEltsPerBlock != 0)
11483 return false;
11484 // Check that the sequence indeed consists of consecutive integers (modulo
11485 // undefs)
11486 for (size_t I = 0; I < NumEltsPerBlock; I++)
11487 if (BlockElts[I] >= 0 && (unsigned)BlockElts[I] != Elt0 + I)
11488 return false;
11489
11490 DupLaneOp = Elt0 / NumEltsPerBlock;
11491 return true;
11492}
11493
11494// check if an EXT instruction can handle the shuffle mask when the
11495// vector sources of the shuffle are different.
11496static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
11497 unsigned &Imm) {
11498 // Look for the first non-undef element.
11499 const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });
11500
11501 // Benefit form APInt to handle overflow when calculating expected element.
11502 unsigned NumElts = VT.getVectorNumElements();
11503 unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
11504 APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1);
11505 // The following shuffle indices must be the successive elements after the
11506 // first real element.
11507 bool FoundWrongElt = std::any_of(FirstRealElt + 1, M.end(), [&](int Elt) {
11508 return Elt != ExpectedElt++ && Elt != -1;
11509 });
11510 if (FoundWrongElt)
11511 return false;
11512
11513 // The index of an EXT is the first element if it is not UNDEF.
11514 // Watch out for the beginning UNDEFs. The EXT index should be the expected
11515 // value of the first element. E.g.
11516 // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
11517 // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
11518 // ExpectedElt is the last mask index plus 1.
11519 Imm = ExpectedElt.getZExtValue();
11520
11521 // There are two difference cases requiring to reverse input vectors.
11522 // For example, for vector <4 x i32> we have the following cases,
11523 // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
11524 // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
11525 // For both cases, we finally use mask <5, 6, 7, 0>, which requires
11526 // to reverse two input vectors.
11527 if (Imm < NumElts)
11528 ReverseEXT = true;
11529 else
11530 Imm -= NumElts;
11531
11532 return true;
11533}
11534
11535/// isREVMask - Check if a vector shuffle corresponds to a REV
11536/// instruction with the specified blocksize. (The order of the elements
11537/// within each block of the vector is reversed.)
11538static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
11539 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64 ||
11540 BlockSize == 128) &&
11541 "Only possible block sizes for REV are: 16, 32, 64, 128");
11542
11543 unsigned EltSz = VT.getScalarSizeInBits();
11544 unsigned NumElts = VT.getVectorNumElements();
11545 unsigned BlockElts = M[0] + 1;
11546 // If the first shuffle index is UNDEF, be optimistic.
11547 if (M[0] < 0)
11548 BlockElts = BlockSize / EltSz;
11549
11550 if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
11551 return false;
11552
11553 for (unsigned i = 0; i < NumElts; ++i) {
11554 if (M[i] < 0)
11555 continue; // ignore UNDEF indices
11556 if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
11557 return false;
11558 }
11559
11560 return true;
11561}
11562
11563static bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
11564 unsigned NumElts = VT.getVectorNumElements();
11565 if (NumElts % 2 != 0)
11566 return false;
11567 WhichResult = (M[0] == 0 ? 0 : 1);
11568 unsigned Idx = WhichResult * NumElts / 2;
11569 for (unsigned i = 0; i != NumElts; i += 2) {
11570 if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
11571 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts))
11572 return false;
11573 Idx += 1;
11574 }
11575
11576 return true;
11577}
11578
11579static bool isUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
11580 unsigned NumElts = VT.getVectorNumElements();
11581 WhichResult = (M[0] == 0 ? 0 : 1);
11582 for (unsigned i = 0; i != NumElts; ++i) {
11583 if (M[i] < 0)
11584 continue; // ignore UNDEF indices
11585 if ((unsigned)M[i] != 2 * i + WhichResult)
11586 return false;
11587 }
11588
11589 return true;
11590}
11591
11592static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
11593 unsigned NumElts = VT.getVectorNumElements();
11594 if (NumElts % 2 != 0)
11595 return false;
11596 WhichResult = (M[0] == 0 ? 0 : 1);
11597 for (unsigned i = 0; i < NumElts; i += 2) {
11598 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
11599 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult))
11600 return false;
11601 }
11602 return true;
11603}
11604
11605/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
11606/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
11607/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
11608static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
11609 unsigned NumElts = VT.getVectorNumElements();
11610 if (NumElts % 2 != 0)
11611 return false;
11612 WhichResult = (M[0] == 0 ? 0 : 1);
11613 unsigned Idx = WhichResult * NumElts / 2;
11614 for (unsigned i = 0; i != NumElts; i += 2) {
11615 if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
11616 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
11617 return false;
11618 Idx += 1;
11619 }
11620
11621 return true;
11622}
11623
11624/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
11625/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
11626/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>,
11627static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
11628 unsigned Half = VT.getVectorNumElements() / 2;
11629 WhichResult = (M[0] == 0 ? 0 : 1);
11630 for (unsigned j = 0; j != 2; ++j) {
11631 unsigned Idx = WhichResult;
11632 for (unsigned i = 0; i != Half; ++i) {
11633 int MIdx = M[i + j * Half];
11634 if (MIdx >= 0 && (unsigned)MIdx != Idx)
11635 return false;
11636 Idx += 2;
11637 }
11638 }
11639
11640 return true;
11641}
11642
11643/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
11644/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
11645/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
11646static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
11647 unsigned NumElts = VT.getVectorNumElements();
11648 if (NumElts % 2 != 0)
11649 return false;
11650 WhichResult = (M[0] == 0 ? 0 : 1);
11651 for (unsigned i = 0; i < NumElts; i += 2) {
11652 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
11653 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
11654 return false;
11655 }
11656 return true;
11657}
11658
11659static bool isINSMask(ArrayRef<int> M, int NumInputElements,
11660 bool &DstIsLeft, int &Anomaly) {
11661 if (M.size() != static_cast<size_t>(NumInputElements))
11662 return false;
11663
11664 int NumLHSMatch = 0, NumRHSMatch = 0;
11665 int LastLHSMismatch = -1, LastRHSMismatch = -1;
11666
11667 for (int i = 0; i < NumInputElements; ++i) {
11668 if (M[i] == -1) {
11669 ++NumLHSMatch;
11670 ++NumRHSMatch;
11671 continue;
11672 }
11673
11674 if (M[i] == i)
11675 ++NumLHSMatch;
11676 else
11677 LastLHSMismatch = i;
11678
11679 if (M[i] == i + NumInputElements)
11680 ++NumRHSMatch;
11681 else
11682 LastRHSMismatch = i;
11683 }
11684
11685 if (NumLHSMatch == NumInputElements - 1) {
11686 DstIsLeft = true;
11687 Anomaly = LastLHSMismatch;
11688 return true;
11689 } else if (NumRHSMatch == NumInputElements - 1) {
11690 DstIsLeft = false;
11691 Anomaly = LastRHSMismatch;
11692 return true;
11693 }
11694
11695 return false;
11696}
11697
11698static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
11699 if (VT.getSizeInBits() != 128)
11700 return false;
11701
11702 unsigned NumElts = VT.getVectorNumElements();
11703
11704 for (int I = 0, E = NumElts / 2; I != E; I++) {
11705 if (Mask[I] != I)
11706 return false;
11707 }
11708
11709 int Offset = NumElts / 2;
11710 for (int I = NumElts / 2, E = NumElts; I != E; I++) {
11711 if (Mask[I] != I + SplitLHS * Offset)
11712 return false;
11713 }
11714
11715 return true;
11716}
11717
11719 SDLoc DL(Op);
11720 EVT VT = Op.getValueType();
11721 SDValue V0 = Op.getOperand(0);
11722 SDValue V1 = Op.getOperand(1);
11723 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
11724
11727 return SDValue();
11728
11729 bool SplitV0 = V0.getValueSizeInBits() == 128;
11730
11731 if (!isConcatMask(Mask, VT, SplitV0))
11732 return SDValue();
11733
11734 EVT CastVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
11735 if (SplitV0) {
11736 V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
11737 DAG.getConstant(0, DL, MVT::i64));
11738 }
11739 if (V1.getValueSizeInBits() == 128) {
11740 V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
11741 DAG.getConstant(0, DL, MVT::i64));
11742 }
11743 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
11744}
11745
11746/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
11747/// the specified operations to build the shuffle. ID is the perfect-shuffle
11748//ID, V1 and V2 are the original shuffle inputs. PFEntry is the Perfect shuffle
11749//table entry and LHS/RHS are the immediate inputs for this stage of the
11750//shuffle.
11752 SDValue V2, unsigned PFEntry, SDValue LHS,
11753 SDValue RHS, SelectionDAG &DAG,
11754 const SDLoc &dl) {
11755 unsigned OpNum = (PFEntry >> 26) & 0x0F;
11756 unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
11757 unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
11758
11759 enum {
11760 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
11761 OP_VREV,
11762 OP_VDUP0,
11763 OP_VDUP1,
11764 OP_VDUP2,
11765 OP_VDUP3,
11766 OP_VEXT1,
11767 OP_VEXT2,
11768 OP_VEXT3,
11769 OP_VUZPL, // VUZP, left result
11770 OP_VUZPR, // VUZP, right result
11771 OP_VZIPL, // VZIP, left result
11772 OP_VZIPR, // VZIP, right result
11773 OP_VTRNL, // VTRN, left result
11774 OP_VTRNR, // VTRN, right result
11775 OP_MOVLANE // Move lane. RHSID is the lane to move into
11776 };
11777
11778 if (OpNum == OP_COPY) {
11779 if (LHSID == (1 * 9 + 2) * 9 + 3)
11780 return LHS;
11781 assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
11782 return RHS;
11783 }
11784
11785 if (OpNum == OP_MOVLANE) {
11786 // Decompose a PerfectShuffle ID to get the Mask for lane Elt
11787 auto getPFIDLane = [](unsigned ID, int Elt) -> int {
11788 assert(Elt < 4 && "Expected Perfect Lanes to be less than 4");
11789 Elt = 3 - Elt;
11790 while (Elt > 0) {
11791 ID /= 9;
11792 Elt--;
11793 }
11794 return (ID % 9 == 8) ? -1 : ID % 9;
11795 };
11796
11797 // For OP_MOVLANE shuffles, the RHSID represents the lane to move into. We
11798 // get the lane to move from the PFID, which is always from the
11799 // original vectors (V1 or V2).
11801 LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
11802 EVT VT = OpLHS.getValueType();
11803 assert(RHSID < 8 && "Expected a lane index for RHSID!");
11804 unsigned ExtLane = 0;
11805 SDValue Input;
11806
11807 // OP_MOVLANE are either D movs (if bit 0x4 is set) or S movs. D movs
11808 // convert into a higher type.
11809 if (RHSID & 0x4) {
11810 int MaskElt = getPFIDLane(ID, (RHSID & 0x01) << 1) >> 1;
11811 if (MaskElt == -1)
11812 MaskElt = (getPFIDLane(ID, ((RHSID & 0x01) << 1) + 1) - 1) >> 1;
11813 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
11814 ExtLane = MaskElt < 2 ? MaskElt : (MaskElt - 2);
11815 Input = MaskElt < 2 ? V1 : V2;
11816 if (VT.getScalarSizeInBits() == 16) {
11817 Input = DAG.getBitcast(MVT::v2f32, Input);
11818 OpLHS = DAG.getBitcast(MVT::v2f32, OpLHS);
11819 } else {
11820 assert(VT.getScalarSizeInBits() == 32 &&
11821 "Expected 16 or 32 bit shuffle elemements");
11822 Input = DAG.getBitcast(MVT::v2f64, Input);
11823 OpLHS = DAG.getBitcast(MVT::v2f64, OpLHS);
11824 }
11825 } else {
11826 int MaskElt = getPFIDLane(ID, RHSID);
11827 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
11828 ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4);
11829 Input = MaskElt < 4 ? V1 : V2;
11830 // Be careful about creating illegal types. Use f16 instead of i16.
11831 if (VT == MVT::v4i16) {
11832 Input = DAG.getBitcast(MVT::v4f16, Input);
11833 OpLHS = DAG.getBitcast(MVT::v4f16, OpLHS);
11834 }
11835 }
11838 Input, DAG.getVectorIdxConstant(ExtLane, dl));
11839 SDValue Ins =
11840 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Input.getValueType(), OpLHS,
11841 Ext, DAG.getVectorIdxConstant(RHSID & 0x3, dl));
11842 return DAG.getBitcast(VT, Ins);
11843 }
11844
11845 SDValue OpLHS, OpRHS;
11846 OpLHS = GeneratePerfectShuffle(LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS,
11847 RHS, DAG, dl);
11848 OpRHS = GeneratePerfectShuffle(RHSID, V1, V2, PerfectShuffleTable[RHSID], LHS,
11849 RHS, DAG, dl);
11850 EVT VT = OpLHS.getValueType();
11851
11852 switch (OpNum) {
11853 default:
11854 llvm_unreachable("Unknown shuffle opcode!");
11855 case OP_VREV:
11856 // VREV divides the vector in half and swaps within the half.
11857 if (VT.getVectorElementType() == MVT::i32 ||
11858 VT.getVectorElementType() == MVT::f32)
11859 return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
11860 // vrev <4 x i16> -> REV32
11861 if (VT.getVectorElementType() == MVT::i16 ||
11862 VT.getVectorElementType() == MVT::f16 ||
11863 VT.getVectorElementType() == MVT::bf16)
11864 return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
11865 // vrev <4 x i8> -> REV16
11866 assert(VT.getVectorElementType() == MVT::i8);
11867 return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS);
11868 case OP_VDUP0:
11869 case OP_VDUP1:
11870 case OP_VDUP2:
11871 case OP_VDUP3: {
11872 EVT EltTy = VT.getVectorElementType();
11873 unsigned Opcode;
11874 if (EltTy == MVT::i8)
11875 Opcode = AArch64ISD::DUPLANE8;
11876 else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16)
11877 Opcode = AArch64ISD::DUPLANE16;
11878 else if (EltTy == MVT::i32 || EltTy == MVT::f32)
11879 Opcode = AArch64ISD::DUPLANE32;
11880 else if (EltTy == MVT::i64 || EltTy == MVT::f64)
11881 Opcode = AArch64ISD::DUPLANE64;
11882 else
11883 llvm_unreachable("Invalid vector element type?");
11884
11885 if (VT.getSizeInBits() == 64)
11886 OpLHS = WidenVector(OpLHS, DAG);
11887 SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64);
11888 return DAG.getNode(Opcode, dl, VT, OpLHS, Lane);
11889 }
11890 case OP_VEXT1:
11891 case OP_VEXT2:
11892 case OP_VEXT3: {
11893 unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
11894 return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS,
11895 DAG.getConstant(Imm, dl, MVT::i32));
11896 }
11897 case OP_VUZPL:
11898 return DAG.getNode(AArch64ISD::UZP1, dl, VT, OpLHS, OpRHS);
11899 case OP_VUZPR:
11900 return DAG.getNode(AArch64ISD::UZP2, dl, VT, OpLHS, OpRHS);
11901 case OP_VZIPL:
11902 return DAG.getNode(AArch64ISD::ZIP1, dl, VT, OpLHS, OpRHS);
11903 case OP_VZIPR:
11904 return DAG.getNode(AArch64ISD::ZIP2, dl, VT, OpLHS, OpRHS);
11905 case OP_VTRNL:
11906 return DAG.getNode(AArch64ISD::TRN1, dl, VT, OpLHS, OpRHS);
11907 case OP_VTRNR:
11908 return DAG.getNode(AArch64ISD::TRN2, dl, VT, OpLHS, OpRHS);
11909 }
11910}
11911
11913 SelectionDAG &DAG) {
11914 // Check to see if we can use the TBL instruction.
11915 SDValue V1 = Op.getOperand(0);
11916 SDValue V2 = Op.getOperand(1);
11917 SDLoc DL(Op);
11918
11919 EVT EltVT = Op.getValueType().getVectorElementType();
11920 unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
11921
11922 bool Swap = false;
11923 if (V1.isUndef() || isZerosVector(V1.getNode())) {
11924 std::swap(V1, V2);
11925 Swap = true;
11926 }
11927
11928 // If the V2 source is undef or zero then we can use a tbl1, as tbl1 will fill
11929 // out of range values with 0s. We do need to make sure that any out-of-range
11930 // values are really out-of-range for a v16i8 vector.
11931 bool IsUndefOrZero = V2.isUndef() || isZerosVector(V2.getNode());
11932 MVT IndexVT = MVT::v8i8;
11933 unsigned IndexLen = 8;
11934 if (Op.getValueSizeInBits() == 128) {
11935 IndexVT = MVT::v16i8;
11936 IndexLen = 16;
11937 }
11938
11940 for (int Val : ShuffleMask) {
11941 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
11942 unsigned Offset = Byte + Val * BytesPerElt;
11943 if (Swap)
11944 Offset = Offset < IndexLen ? Offset + IndexLen : Offset - IndexLen;
11945 if (IsUndefOrZero && Offset >= IndexLen)
11946 Offset = 255;
11947 TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32));
11948 }
11949 }
11950
11951 SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
11952 SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);
11953
11954 SDValue Shuffle;
11955 if (IsUndefOrZero) {
11956 if (IndexLen == 8)
11957 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
11958 Shuffle = DAG.getNode(
11959 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
11960 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
11961 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
11962 } else {
11963 if (IndexLen == 8) {
11964 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
11965 Shuffle = DAG.getNode(
11966 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
11967 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
11968 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
11969 } else {
11970 // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
11971 // cannot currently represent the register constraints on the input
11972 // table registers.
11973 // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
11974 // DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
11975 // IndexLen));
11976 Shuffle = DAG.getNode(
11977 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
11978 DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst,
11979 V2Cst,
11980 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
11981 }
11982 }
11983 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
11984}
11985
11986static unsigned getDUPLANEOp(EVT EltType) {
11987 if (EltType == MVT::i8)
11988 return AArch64ISD::DUPLANE8;
11989 if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16)
11990 return AArch64ISD::DUPLANE16;
11991 if (EltType == MVT::i32 || EltType == MVT::f32)
11992 return AArch64ISD::DUPLANE32;
11993 if (EltType == MVT::i64 || EltType == MVT::f64)
11994 return AArch64ISD::DUPLANE64;
11995
11996 llvm_unreachable("Invalid vector element type?");
11997}
11998
11999static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT,
12000 unsigned Opcode, SelectionDAG &DAG) {
12001 // Try to eliminate a bitcasted extract subvector before a DUPLANE.
12002 auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) {
12003 // Match: dup (bitcast (extract_subv X, C)), LaneC
12004 if (BitCast.getOpcode() != ISD::BITCAST ||
12006 return false;
12007
12008 // The extract index must align in the destination type. That may not
12009 // happen if the bitcast is from narrow to wide type.
12010 SDValue Extract = BitCast.getOperand(0);
12011 unsigned ExtIdx = Extract.getConstantOperandVal(1);
12012 unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits();
12013 unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth;
12014 unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits();
12015 if (ExtIdxInBits % CastedEltBitWidth != 0)
12016 return false;
12017
12018 // Can't handle cases where vector size is not 128-bit
12019 if (!Extract.getOperand(0).getValueType().is128BitVector())
12020 return false;
12021
12022 // Update the lane value by offsetting with the scaled extract index.
12023 LaneC += ExtIdxInBits / CastedEltBitWidth;
12024
12025 // Determine the casted vector type of the wide vector input.
12026 // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
12027 // Examples:
12028 // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
12029 // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
12030 unsigned SrcVecNumElts =
12031 Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth;
12033 SrcVecNumElts);
12034 return true;
12035 };
12036 MVT CastVT;
12037 if (getScaledOffsetDup(V, Lane, CastVT)) {
12038 V = DAG.getBitcast(CastVT, V.getOperand(0).getOperand(0));
12039 } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
12040 V.getOperand(0).getValueType().is128BitVector()) {
12041 // The lane is incremented by the index of the extract.
12042 // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
12043 Lane += V.getConstantOperandVal(1);
12044 V = V.getOperand(0);
12045 } else if (V.getOpcode() == ISD::CONCAT_VECTORS) {
12046 // The lane is decremented if we are splatting from the 2nd operand.
12047 // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
12048 unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
12049 Lane -= Idx * VT.getVectorNumElements() / 2;
12050 V = WidenVector(V.getOperand(Idx), DAG);
12051 } else if (VT.getSizeInBits() == 64) {
12052 // Widen the operand to 128-bit register with undef.
12053 V = WidenVector(V, DAG);
12054 }
12055 return DAG.getNode(Opcode, dl, VT, V, DAG.getConstant(Lane, dl, MVT::i64));
12056}
12057
12058// Return true if we can get a new shuffle mask by checking the parameter mask
12059// array to test whether every two adjacent mask values are continuous and
12060// starting from an even number.
12062 SmallVectorImpl<int> &NewMask) {
12063 unsigned NumElts = VT.getVectorNumElements();
12064 if (NumElts % 2 != 0)
12065 return false;
12066
12067 NewMask.clear();
12068 for (unsigned i = 0; i < NumElts; i += 2) {
12069 int M0 = M[i];
12070 int M1 = M[i + 1];
12071
12072 // If both elements are undef, new mask is undef too.
12073 if (M0 == -1 && M1 == -1) {
12074 NewMask.push_back(-1);
12075 continue;
12076 }
12077
12078 if (M0 == -1 && M1 != -1 && (M1 % 2) == 1) {
12079 NewMask.push_back(M1 / 2);
12080 continue;
12081 }
12082
12083 if (M0 != -1 && (M0 % 2) == 0 && ((M0 + 1) == M1 || M1 == -1)) {
12084 NewMask.push_back(M0 / 2);
12085 continue;
12086 }
12087
12088 NewMask.clear();
12089 return false;
12090 }
12091
12092 assert(NewMask.size() == NumElts / 2 && "Incorrect size for mask!");
12093 return true;
12094}
12095
12096// Try to widen element type to get a new mask value for a better permutation
12097// sequence, so that we can use NEON shuffle instructions, such as zip1/2,
12098// UZP1/2, TRN1/2, REV, INS, etc.
12099// For example:
12100// shufflevector <4 x i32> %a, <4 x i32> %b,
12101// <4 x i32> <i32 6, i32 7, i32 2, i32 3>
12102// is equivalent to:
12103// shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
12104// Finally, we can get:
12105// mov v0.d[0], v1.d[1]
12107 SDLoc DL(Op);
12108 EVT VT = Op.getValueType();
12109 EVT ScalarVT = VT.getVectorElementType();
12110 unsigned ElementSize = ScalarVT.getFixedSizeInBits();
12111 SDValue V0 = Op.getOperand(0);
12112 SDValue V1 = Op.getOperand(1);
12113 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
12114
12115 // If combining adjacent elements, like two i16's -> i32, two i32's -> i64 ...
12116 // We need to make sure the wider element type is legal. Thus, ElementSize
12117 // should be not larger than 32 bits, and i1 type should also be excluded.
12118 if (ElementSize > 32 || ElementSize == 1)
12119 return SDValue();
12120
12121 SmallVector<int, 8> NewMask;
12122 if (isWideTypeMask(Mask, VT, NewMask)) {
12123 MVT NewEltVT = VT.isFloatingPoint()
12124 ? MVT::getFloatingPointVT(ElementSize * 2)
12125 : MVT::getIntegerVT(ElementSize * 2);
12126 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
12127 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
12128 V0 = DAG.getBitcast(NewVT, V0);
12129 V1 = DAG.getBitcast(NewVT, V1);
12130 return DAG.getBitcast(VT,
12131 DAG.getVectorShuffle(NewVT, DL, V0, V1, NewMask));
12132 }
12133 }
12134
12135 return SDValue();
12136}
12137
12138// Try to fold shuffle (tbl2, tbl2) into a single tbl4.
12140 ArrayRef<int> ShuffleMask,
12141 SelectionDAG &DAG) {
12142 SDValue Tbl1 = Op->getOperand(0);
12143 SDValue Tbl2 = Op->getOperand(1);
12144 SDLoc dl(Op);
12145 SDValue Tbl2ID =
12146 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl2, dl, MVT::i64);
12147
12148 EVT VT = Op.getValueType();
12149 if (Tbl1->getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
12150 Tbl1->getOperand(0) != Tbl2ID ||
12152 Tbl2->getOperand(0) != Tbl2ID)
12153 return SDValue();
12154
12155 if (Tbl1->getValueType(0) != MVT::v16i8 ||
12156 Tbl2->getValueType(0) != MVT::v16i8)
12157 return SDValue();
12158
12159 SDValue Mask1 = Tbl1->getOperand(3);
12160 SDValue Mask2 = Tbl2->getOperand(3);
12161 SmallVector<SDValue, 16> TBLMaskParts(16, SDValue());
12162 for (unsigned I = 0; I < 16; I++) {
12163 if (ShuffleMask[I] < 16)
12164 TBLMaskParts[I] = Mask1->getOperand(ShuffleMask[I]);
12165 else {
12166 auto *C =
12167 dyn_cast<ConstantSDNode>(Mask2->getOperand(ShuffleMask[I] - 16));
12168 if (!C)
12169 return SDValue();
12170 TBLMaskParts[I] = DAG.getConstant(C->getSExtValue() + 32, dl, MVT::i32);
12171 }
12172 }
12173
12174 SDValue TBLMask = DAG.getBuildVector(VT, dl, TBLMaskParts);
12175 SDValue ID =
12176 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl4, dl, MVT::i64);
12177
12178 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v16i8,
12179 {ID, Tbl1->getOperand(1), Tbl1->getOperand(2),
12180 Tbl2->getOperand(1), Tbl2->getOperand(2), TBLMask});
12181}
12182
12183// Baseline legalization for ZERO_EXTEND_VECTOR_INREG will blend-in zeros,
12184// but we don't have an appropriate instruction,
12185// so custom-lower it as ZIP1-with-zeros.
SDValue
AArch64TargetLowering::LowerZERO_EXTEND_VECTOR_INREG(SDValue Op,
                                                     SelectionDAG &DAG) const {
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  SDValue SrcOp = Op.getOperand(0);
  EVT SrcVT = SrcOp.getValueType();
  assert(VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits() == 0 &&
         "Unexpected extension factor.");
  unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
  // FIXME: support multi-step zipping?
  if (Scale != 2)
    return SDValue();
  // ZIP1 interleaves SrcOp with a zero vector, so each source element ends up
  // followed by a zero element — exactly an in-register zero-extension to
  // double the element width once the result is reinterpreted as VT.
  SDValue Zeros = DAG.getConstant(0, dl, SrcVT);
  return DAG.getBitcast(VT,
                        DAG.getNode(AArch64ISD::ZIP1, dl, SrcVT, SrcOp, Zeros));
}
12203
12204SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
12205 SelectionDAG &DAG) const {
12206 SDLoc dl(Op);
12207 EVT VT = Op.getValueType();
12208
12209 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
12210
12211 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
12212 return LowerFixedLengthVECTOR_SHUFFLEToSVE(Op, DAG);
12213
12214 // Convert shuffles that are directly supported on NEON to target-specific
12215 // DAG nodes, instead of keeping them as shuffles and matching them again
12216 // during code selection. This is more efficient and avoids the possibility
12217 // of inconsistencies between legalization and selection.
12218 ArrayRef<int> ShuffleMask = SVN->getMask();
12219
12220 SDValue V1 = Op.getOperand(0);
12221 SDValue V2 = Op.getOperand(1);
12222
12223 assert(V1.getValueType() == VT && "Unexpected VECTOR_SHUFFLE type!");
12224 assert(ShuffleMask.size() == VT.getVectorNumElements() &&
12225 "Unexpected VECTOR_SHUFFLE mask size!");
12226
12227 if (SDValue Res = tryToConvertShuffleOfTbl2ToTbl4(Op, ShuffleMask, DAG))
12228 return Res;
12229
12230 if (SVN->isSplat()) {
12231 int Lane = SVN->getSplatIndex();
12232 // If this is undef splat, generate it via "just" vdup, if possible.
12233 if (Lane == -1)
12234 Lane = 0;
12235
12236 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
12237 return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(),
12238 V1.getOperand(0));
12239 // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
12240 // constant. If so, we can just reference the lane's definition directly.
12241 if (V1.getOpcode() == ISD::BUILD_VECTOR &&
12242 !isa<ConstantSDNode>(V1.getOperand(Lane)))
12243 return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane));
12244
12245 // Otherwise, duplicate from the lane of the input vector.
12246 unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
12247 return constructDup(V1, Lane, dl, VT, Opcode, DAG);
12248 }
12249
12250 // Check if the mask matches a DUP for a wider element
12251 for (unsigned LaneSize : {64U, 32U, 16U}) {
12252 unsigned Lane = 0;
12253 if (isWideDUPMask(ShuffleMask, VT, LaneSize, Lane)) {
12254 unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64
12255 : LaneSize == 32 ? AArch64ISD::DUPLANE32
12257 // Cast V1 to an integer vector with required lane size
12258 MVT NewEltTy = MVT::getIntegerVT(LaneSize);
12259 unsigned NewEltCount = VT.getSizeInBits() / LaneSize;
12260 MVT NewVecTy = MVT::getVectorVT(NewEltTy, NewEltCount);
12261 V1 = DAG.getBitcast(NewVecTy, V1);
12262 // Constuct the DUP instruction
12263 V1 = constructDup(V1, Lane, dl, NewVecTy, Opcode, DAG);
12264 // Cast back to the original type
12265 return DAG.getBitcast(VT, V1);
12266 }
12267 }
12268
12269 if (isREVMask(ShuffleMask, VT, 64))
12270 return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2);
12271 if (isREVMask(ShuffleMask, VT, 32))
12272 return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2);
12273 if (isREVMask(ShuffleMask, VT, 16))
12274 return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2);
12275
12276 if (((VT.getVectorNumElements() == 8 && VT.getScalarSizeInBits() == 16) ||
12277 (VT.getVectorNumElements() == 16 && VT.getScalarSizeInBits() == 8)) &&
12278 ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size())) {
12279 SDValue Rev = DAG.getNode(AArch64ISD::REV64, dl, VT, V1);
12280 return DAG.getNode(AArch64ISD::EXT, dl, VT, Rev, Rev,
12281 DAG.getConstant(8, dl, MVT::i32));
12282 }
12283
12284 bool ReverseEXT = false;
12285 unsigned Imm;
12286 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
12287 if (ReverseEXT)
12288 std::swap(V1, V2);
12289 Imm *= getExtFactor(V1);
12290 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
12291 DAG.getConstant(Imm, dl, MVT::i32));
12292 } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
12293 Imm *= getExtFactor(V1);
12294 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1,
12295 DAG.getConstant(Imm, dl, MVT::i32));
12296 }
12297
12298 unsigned WhichResult;
12299 if (isZIPMask(ShuffleMask, VT, WhichResult)) {
12300 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
12301 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
12302 }
12303 if (isUZPMask(ShuffleMask, VT, WhichResult)) {
12304 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
12305 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
12306 }
12307 if (isTRNMask(ShuffleMask, VT, WhichResult)) {
12308 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
12309 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
12310 }
12311
12312 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
12313 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
12314 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
12315 }
12316 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
12317 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
12318 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
12319 }
12320 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
12321 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
12322 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
12323 }
12324
12326 return Concat;
12327
12328 bool DstIsLeft;
12329 int Anomaly;
12330 int NumInputElements = V1.getValueType().getVectorNumElements();
12331 if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
12332 SDValue DstVec = DstIsLeft ? V1 : V2;
12333 SDValue DstLaneV = DAG.getConstant(Anomaly, dl, MVT::i64);
12334
12335 SDValue SrcVec = V1;
12336 int SrcLane = ShuffleMask[Anomaly];
12337 if (SrcLane >= NumInputElements) {
12338 SrcVec = V2;
12339 SrcLane -= VT.getVectorNumElements();
12340 }
12341 SDValue SrcLaneV = DAG.getConstant(SrcLane, dl, MVT::i64);
12342
12343 EVT ScalarVT = VT.getVectorElementType();
12344
12345 if (ScalarVT.getFixedSizeInBits() < 32 && ScalarVT.isInteger())
12346 ScalarVT = MVT::i32;
12347
12348 return DAG.getNode(
12349 ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
12350 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV),
12351 DstLaneV);
12352 }
12353
12354 if (SDValue NewSD = tryWidenMaskForShuffle(Op, DAG))
12355 return NewSD;
12356
12357 // If the shuffle is not directly supported and it has 4 elements, use
12358 // the PerfectShuffle-generated table to synthesize it from other shuffles.
12359 unsigned NumElts = VT.getVectorNumElements();
12360 if (NumElts == 4) {
12361 unsigned PFIndexes[4];
12362 for (unsigned i = 0; i != 4; ++i) {
12363 if (ShuffleMask[i] < 0)
12364 PFIndexes[i] = 8;
12365 else
12366 PFIndexes[i] = ShuffleMask[i];
12367 }
12368
12369 // Compute the index in the perfect shuffle table.
12370 unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
12371 PFIndexes[2] * 9 + PFIndexes[3];
12372 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
12373 return GeneratePerfectShuffle(PFTableIndex, V1, V2, PFEntry, V1, V2, DAG,
12374 dl);
12375 }
12376
12377 return GenerateTBL(Op, ShuffleMask, DAG);
12378}
12379
SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
                                                 SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();

  // Fixed-length vectors that must be handled with SVE take the generic
  // scalable-op lowering path instead.
  if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
    return LowerToScalableOp(Op, DAG);

  assert(VT.isScalableVector() && VT.getVectorElementType() == MVT::i1 &&
         "Unexpected vector type!");

  // We can handle the constant cases during isel.
  if (isa<ConstantSDNode>(Op.getOperand(0)))
    return Op;

  // There isn't a natural way to handle the general i1 case, so we use some
  // trickery with whilelo.
  SDLoc DL(Op);
  // Sign-extend the i1 splat value within an i64 so it becomes 0 or all-ones;
  // whilelo(0, val) then produces an all-false or all-true predicate
  // respectively.
  SDValue SplatVal = DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, MVT::i64);
  SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, SplatVal,
                         DAG.getValueType(MVT::i1));
  SDValue ID =
      DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
  // For nxv1i1, build the predicate as nxv2i1 and extract the low half, since
  // the whilelo intrinsic is emitted here with an nxv2i1 result type.
  if (VT == MVT::nxv1i1)
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::nxv1i1,
                       DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::nxv2i1, ID,
                                   Zero, SplatVal),
                       Zero);
  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, ID, Zero, SplatVal);
}
12410
12411SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
12412 SelectionDAG &DAG) const {
12413 SDLoc DL(Op);
12414
12415 EVT VT = Op.getValueType();
12416 if (!isTypeLegal(VT) || !VT.isScalableVector())
12417 return SDValue();
12418
12419 // Current lowering only supports the SVE-ACLE types.
12421 return SDValue();
12422
12423 // The DUPQ operation is indepedent of element type so normalise to i64s.
12424 SDValue Idx128 = Op.getOperand(2);
12425
12426 // DUPQ can be used when idx is in range.
12427 auto *CIdx = dyn_cast<ConstantSDNode>(Idx128);
12428 if (CIdx && (CIdx->getZExtValue() <= 3)) {
12429 SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64);
12430 return DAG.getNode(AArch64ISD::DUPLANE128, DL, VT, Op.getOperand(1), CI);
12431 }
12432
12433 SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1));
12434
12435 // The ACLE says this must produce the same result as:
12436 // svtbl(data, svadd_x(svptrue_b64(),
12437 // svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
12438 // index * 2))
12439 SDValue One = DAG.getConstant(1, DL, MVT::i64);
12440 SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One);
12441
12442 // create the vector 0,1,0,1,...
12443 SDValue SV = DAG.getStepVector(DL, MVT::nxv2i64);
12444 SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne);
12445
12446 // create the vector idx64,idx64+1,idx64,idx64+1,...
12447 SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128);
12448 SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64);
12449 SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64);
12450
12451 // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
12452 SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask);
12453 return DAG.getNode(ISD::BITCAST, DL, VT, TBL);
12454}
12455
12456
12457static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
12458 APInt &UndefBits) {
12459 EVT VT = BVN->getValueType(0);
12460 APInt SplatBits, SplatUndef;
12461 unsigned SplatBitSize;
12462 bool HasAnyUndefs;
12463 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
12464 unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
12465
12466 for (unsigned i = 0; i < NumSplats; ++i) {
12467 CnstBits <<= SplatBitSize;
12468 UndefBits <<= SplatBitSize;
12469 CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
12470 UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
12471 }
12472
12473 return true;
12474 }
12475
12476 return false;
12477}
12478
// Try 64-bit splatted SIMD immediate.
// Returns a MOVI-style node (via NewOp) wrapped in an NVCAST back to Op's
// type, or an empty SDValue when Bits is not a 64-bit splat or not encodable.
static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
                                  const APInt &Bits) {
  // Only meaningful when the top and bottom halves carry the same pattern.
  if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
    uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
    EVT VT = Op.getValueType();
    MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64;

    // NOTE(review): the encodability guard (and its encode call) is missing
    // from this view -- confirm against upstream before relying on this body.

      SDLoc dl(Op);
      SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
                                DAG.getConstant(Value, dl, MVT::i32));
      return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
    }
  }

  return SDValue();
}
12499
// Try 32-bit splatted SIMD immediate.
// Attempts the four byte-shift variants of the 32-bit modified-immediate
// encoding (shift 0/8/16/24). When LHS is provided the immediate is combined
// with *LHS (e.g. for ORR/BIC with immediate); otherwise a plain move is
// built. Returns an empty SDValue when no encoding matches.
static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
                                  const APInt &Bits,
                                  const SDValue *LHS = nullptr) {
  EVT VT = Op.getValueType();
  // NOTE(review): the second half of this condition is missing from this view
  // (presumably a NEON-availability check) -- confirm against upstream.
  if (VT.isFixedLengthVector() &&
    return SDValue();

  // Only meaningful when the top and bottom halves carry the same pattern.
  if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
    uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
    MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
    bool isAdvSIMDModImm = false;
    uint64_t Shift;

    // NOTE(review): the encode calls inside each branch are missing from this
    // view -- confirm against upstream.
    if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Value))) {
      Shift = 0;
    }
    else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Value))) {
      Shift = 8;
    }
    else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Value))) {
      Shift = 16;
    }
    else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Value))) {
      Shift = 24;
    }

    if (isAdvSIMDModImm) {
      SDLoc dl(Op);
      SDValue Mov;

      if (LHS)
        // Combined form: NVCAST the existing operand into the move type so the
        // immediate operation applies lane-wise, then cast the result back.
        Mov = DAG.getNode(NewOp, dl, MovTy,
                          DAG.getNode(AArch64ISD::NVCAST, dl, MovTy, *LHS),
                          DAG.getConstant(Value, dl, MVT::i32),
                          DAG.getConstant(Shift, dl, MVT::i32));
      else
        Mov = DAG.getNode(NewOp, dl, MovTy,
                          DAG.getConstant(Value, dl, MVT::i32),
                          DAG.getConstant(Shift, dl, MVT::i32));

      return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
    }
  }

  return SDValue();
}
12552
// Try 16-bit splatted SIMD immediate.
// Same structure as tryAdvSIMDModImm32, but for the two 16-bit
// modified-immediate variants (shift 0/8). Returns an empty SDValue when no
// encoding matches.
static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
                                  const APInt &Bits,
                                  const SDValue *LHS = nullptr) {
  EVT VT = Op.getValueType();
  // NOTE(review): the second half of this condition is missing from this view
  // (presumably a NEON-availability check) -- confirm against upstream.
  if (VT.isFixedLengthVector() &&
    return SDValue();

  // Only meaningful when the top and bottom halves carry the same pattern.
  if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
    uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
    MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
    bool isAdvSIMDModImm = false;
    uint64_t Shift;

    // NOTE(review): the encode calls inside each branch are missing from this
    // view -- confirm against upstream.
    if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Value))) {
      Shift = 0;
    }
    else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Value))) {
      Shift = 8;
    }

    if (isAdvSIMDModImm) {
      SDLoc dl(Op);
      SDValue Mov;

      if (LHS)
        // Combined form: apply the immediate operation to *LHS in MovTy lanes.
        Mov = DAG.getNode(NewOp, dl, MovTy,
                          DAG.getNode(AArch64ISD::NVCAST, dl, MovTy, *LHS),
                          DAG.getConstant(Value, dl, MVT::i32),
                          DAG.getConstant(Shift, dl, MVT::i32));
      else
        Mov = DAG.getNode(NewOp, dl, MovTy,
                          DAG.getConstant(Value, dl, MVT::i32),
                          DAG.getConstant(Shift, dl, MVT::i32));

      return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
    }
  }

  return SDValue();
}
12597
// Try 32-bit splatted SIMD immediate with shifted ones.
// NOTE(review): the first line of the signature is missing from this view;
// call sites below use it as tryAdvSIMDModImm321s(NewOp, Op, DAG, Bits) --
// confirm against upstream.
                                    SelectionDAG &DAG, const APInt &Bits) {
  // Only meaningful when the top and bottom halves carry the same pattern.
  if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
    uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
    EVT VT = Op.getValueType();
    MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
    bool isAdvSIMDModImm = false;
    uint64_t Shift;

    // NOTE(review): the encode calls inside each branch are missing from this
    // view. The shift operands 264/272 presumably encode the "MSL" ones-shift
    // forms (256 + 8 and 256 + 16) -- confirm against the MOVImsl/MVNImsl
    // instruction patterns.
    if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Value))) {
      Shift = 264;
    }
    else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Value))) {
      Shift = 272;
    }

    if (isAdvSIMDModImm) {
      SDLoc dl(Op);
      SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
                                DAG.getConstant(Value, dl, MVT::i32),
                                DAG.getConstant(Shift, dl, MVT::i32));
      return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
    }
  }

  return SDValue();
}
12628
// Try 8-bit splatted SIMD immediate.
// Returns a byte-splat move (via NewOp) wrapped in an NVCAST back to Op's
// type, or an empty SDValue when Bits is not a 64-bit splat or not encodable.
static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
                                 const APInt &Bits) {
  // Only meaningful when the top and bottom halves carry the same pattern.
  if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
    uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
    EVT VT = Op.getValueType();
    MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;

    // NOTE(review): the encodability guard (and its encode call) is missing
    // from this view -- confirm against upstream before relying on this body.

      SDLoc dl(Op);
      SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
                                DAG.getConstant(Value, dl, MVT::i32));
      return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
    }
  }

  return SDValue();
}
12649
// Try FP splatted SIMD immediate.
// Matches the FMOV (vector, immediate) encodings: the f32 form for both
// 64- and 128-bit vectors, and the f64 form for 128-bit vectors only.
// Returns an empty SDValue when no encoding matches.
static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
                                  const APInt &Bits) {
  // Only meaningful when the top and bottom halves carry the same pattern.
  if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
    uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
    EVT VT = Op.getValueType();
    bool isWide = (VT.getSizeInBits() == 128);
    MVT MovTy;
    bool isAdvSIMDModImm = false;

    // NOTE(review): the encode calls inside each branch are missing from this
    // view -- confirm against upstream.
    if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Value))) {
      MovTy = isWide ? MVT::v4f32 : MVT::v2f32;
    }
    else if (isWide &&
             (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Value))) {
      MovTy = MVT::v2f64;
    }

    if (isAdvSIMDModImm) {
      SDLoc dl(Op);
      SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
                                DAG.getConstant(Value, dl, MVT::i32));
      return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
    }
  }

  return SDValue();
}
12680
12681// Specialized code to quickly find if PotentialBVec is a BuildVector that
12682// consists of only the same constant int value, returned in reference arg
12683// ConstVal
12684static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
12685 uint64_t &ConstVal) {
12686 BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
12687 if (!Bvec)
12688 return false;
12689 ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
12690 if (!FirstElt)
12691 return false;
12692 EVT VT = Bvec->getValueType(0);
12693 unsigned NumElts = VT.getVectorNumElements();
12694 for (unsigned i = 1; i < NumElts; ++i)
12695 if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
12696 return false;
12697 ConstVal = FirstElt->getZExtValue();
12698 return true;
12699}
12700
// NOTE(review): the signature line is missing from this view. From the body,
// this helper reports whether N is a constant all-zeros splat (i.e. an
// all-inactive predicate), looking through reinterpret casts -- confirm the
// exact name and parameters against upstream.
  // Look through cast.
  while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST)
    N = N.getOperand(0);

  return ISD::isConstantSplatVectorAllZeros(N.getNode());
}
12708
// NOTE(review): the signature line is missing from this view; call sites
// below invoke this as isAllActivePredicate(DAG, N) -- confirm against
// upstream. Returns true when predicate N is known to have all lanes active
// for the element count implied by its type.
  unsigned NumElts = N.getValueType().getVectorMinNumElements();

  // Look through cast.
  while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) {
    N = N.getOperand(0);
    // When reinterpreting from a type with fewer elements the "new" elements
    // are not active, so bail if they're likely to be used.
    if (N.getValueType().getVectorMinNumElements() < NumElts)
      return false;
  }

  // A constant all-ones splat is trivially all active.
  if (ISD::isConstantSplatVectorAllOnes(N.getNode()))
    return true;

  // "ptrue p.<ty>, all" can be considered all active when <ty> is the same size
  // or smaller than the implicit element type represented by N.
  // NOTE: A larger element count implies a smaller element type.
  if (N.getOpcode() == AArch64ISD::PTRUE &&
      N.getConstantOperandVal(0) == AArch64SVEPredPattern::all)
    return N.getValueType().getVectorMinNumElements() >= NumElts;

  // If we're compiling for a specific vector-length, we can check if the
  // pattern's VL equals that of the scalable vector at runtime.
  if (N.getOpcode() == AArch64ISD::PTRUE) {
    const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
    unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
    unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
    // Only conclusive when min and max vector lengths pin down a single VL.
    if (MaxSVESize && MinSVESize == MaxSVESize) {
      unsigned VScale = MaxSVESize / AArch64::SVEBitsPerBlock;
      unsigned PatNumElts =
          getNumElementsFromSVEPredPattern(N.getConstantOperandVal(0));
      return PatNumElts == (NumElts * VScale);
    }
  }

  return false;
}
12747
// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
// BUILD_VECTORs with constant element C1, C2 is a constant, and:
//   - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2)
//   - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2)
// The (or (lsl Y, C2), (and X, BvecC1)) case is also handled.
// NOTE(review): the signature line is missing from this view; the caller
// below passes (Op.getNode(), DAG) -- confirm against upstream.
  EVT VT = N->getValueType(0);

  if (!VT.isVector())
    return SDValue();

  SDLoc DL(N);

  SDValue And;
  SDValue Shift;

  SDValue FirstOp = N->getOperand(0);
  unsigned FirstOpc = FirstOp.getOpcode();
  SDValue SecondOp = N->getOperand(1);
  unsigned SecondOpc = SecondOp.getOpcode();

  // Is one of the operands an AND or a BICi? The AND may have been optimised to
  // a BICi in order to use an immediate instead of a register.
  // Is the other operand a shl or lshr? This will have been turned into:
  // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift
  // or (AArch64ISD::SHL_PRED || AArch64ISD::SRL_PRED) mask, vector, #shiftVec.
  if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) &&
      (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR ||
       SecondOpc == AArch64ISD::SHL_PRED ||
       SecondOpc == AArch64ISD::SRL_PRED)) {
    And = FirstOp;
    Shift = SecondOp;

  } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) &&
             (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR ||
              FirstOpc == AArch64ISD::SHL_PRED ||
              FirstOpc == AArch64ISD::SRL_PRED)) {
    And = SecondOp;
    Shift = FirstOp;
  } else
    return SDValue();

  bool IsAnd = And.getOpcode() == ISD::AND;
  // NOTE(review): the trailing halves of the next two initialisers are
  // missing from this view (presumably "|| ... SRL_PRED") -- confirm against
  // upstream.
  bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR ||
  bool ShiftHasPredOp = Shift.getOpcode() == AArch64ISD::SHL_PRED ||

  // Is the shift amount constant and are all lanes active?
  uint64_t C2;
  if (ShiftHasPredOp) {
    // Predicated shifts only qualify when every lane participates.
    if (!isAllActivePredicate(DAG, Shift.getOperand(0)))
      return SDValue();
    APInt C;
    // NOTE(review): the splat-constant extraction that populates C is missing
    // from this view -- confirm against upstream.
      return SDValue();
    C2 = C.getZExtValue();
  } else if (ConstantSDNode *C2node =
                 dyn_cast<ConstantSDNode>(Shift.getOperand(1)))
    C2 = C2node->getZExtValue();
  else
    return SDValue();

  APInt C1AsAPInt;
  unsigned ElemSizeInBits = VT.getScalarSizeInBits();
  if (IsAnd) {
    // Is the and mask vector all constant?
    if (!ISD::isConstantSplatVector(And.getOperand(1).getNode(), C1AsAPInt))
      return SDValue();
  } else {
    // Reconstruct the corresponding AND immediate from the two BICi immediates.
    ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(And.getOperand(1));
    ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(And.getOperand(2));
    assert(C1nodeImm && C1nodeShift);
    C1AsAPInt = ~(C1nodeImm->getAPIntValue() << C1nodeShift->getAPIntValue());
    C1AsAPInt = C1AsAPInt.zextOrTrunc(ElemSizeInBits);
  }

  // Is C1 == ~(Ones(ElemSizeInBits) << C2) or
  // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account
  // how much one can shift elements of a particular size?
  if (C2 > ElemSizeInBits)
    return SDValue();

  APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, C2)
                                  : APInt::getLowBitsSet(ElemSizeInBits, C2);
  if (C1AsAPInt != RequiredC1)
    return SDValue();

  SDValue X = And.getOperand(0);
  // Predicated shifts carry the shifted vector in operand 1 and need the
  // shift amount re-materialised as a target constant.
  SDValue Y = ShiftHasPredOp ? Shift.getOperand(1) : Shift.getOperand(0);
  SDValue Imm = ShiftHasPredOp ? DAG.getTargetConstant(C2, DL, MVT::i32)
                               : Shift.getOperand(1);

  unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
  SDValue ResultSLI = DAG.getNode(Inst, DL, VT, X, Y, Imm);

  LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n");
  LLVM_DEBUG(N->dump(&DAG));
  LLVM_DEBUG(dbgs() << "into: \n");
  LLVM_DEBUG(ResultSLI->dump(&DAG));

  ++NumShiftInserts;
  return ResultSLI;
}
12854
12855SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
12856 SelectionDAG &DAG) const {
12857 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
12858 !Subtarget->isNeonAvailable()))
12859 return LowerToScalableOp(Op, DAG);
12860
12861 // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
12862 if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
12863 return Res;
12864
12865 EVT VT = Op.getValueType();
12866 if (VT.isScalableVector())
12867 return Op;
12868
12869 SDValue LHS = Op.getOperand(0);
12870 BuildVectorSDNode *BVN =
12871 dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
12872 if (!BVN) {
12873 // OR commutes, so try swapping the operands.
12874 LHS = Op.getOperand(1);
12875 BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
12876 }
12877 if (!BVN)
12878 return Op;
12879
12880 APInt DefBits(VT.getSizeInBits(), 0);
12881 APInt UndefBits(VT.getSizeInBits(), 0);
12882 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
12883 SDValue NewOp;
12884
12885 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
12886 DefBits, &LHS)) ||
12887 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
12888 DefBits, &LHS)))
12889 return NewOp;
12890
12891 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
12892 UndefBits, &LHS)) ||
12893 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
12894 UndefBits, &LHS)))
12895 return NewOp;
12896 }
12897
12898 // We can always fall back to a non-immediate OR.
12899 return Op;
12900}
12901
// Normalize the operands of BUILD_VECTOR. The value of constant operands will
// be truncated to fit element width.
// NOTE(review): the first line of this signature (return type, name and first
// parameter) is missing from this view -- confirm against upstream.
                                SelectionDAG &DAG) {
  assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  EVT EltTy= VT.getVectorElementType();

  // Only sub-32-bit integer elements need normalizing; FP and wide integer
  // lanes are left untouched.
  if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
    return Op;

  // NOTE(review): the declaration of Ops (the rebuilt operand list) is
  // missing from this view -- confirm against upstream.
  for (SDValue Lane : Op->ops()) {
    // For integer vectors, type legalization would have promoted the
    // operands already. Otherwise, if Op is a floating-point splat
    // (with operands cast to integers), then the only possibilities
    // are constants and UNDEFs.
    if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) {
      // Truncate the constant to the element width, then rebuild it as i32.
      APInt LowBits(EltTy.getSizeInBits(),
                    CstLane->getZExtValue());
      Lane = DAG.getConstant(LowBits.getZExtValue(), dl, MVT::i32);
    } else if (Lane.getNode()->isUndef()) {
      Lane = DAG.getUNDEF(MVT::i32);
    } else {
      assert(Lane.getValueType() == MVT::i32 &&
             "Unexpected BUILD_VECTOR operand type");
    }
    Ops.push_back(Lane);
  }
  return DAG.getBuildVector(VT, dl, Ops);
}
12934
// Try to materialise a constant BUILD_VECTOR with a single AdvSIMD
// modified-immediate instruction, trying each encoding in turn over four bit
// images: the defined bits (MOVI/FMOV family), their inverse (MVNI family),
// and the undef-extended variants of both.
// NOTE(review): the signature line is missing from this view; call sites
// below use ConstantBuildVector(Op, DAG) -- confirm against upstream.
  EVT VT = Op.getValueType();

  APInt DefBits(VT.getSizeInBits(), 0);
  APInt UndefBits(VT.getSizeInBits(), 0);
  BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
  if (resolveBuildVector(BVN, DefBits, UndefBits)) {
    SDValue NewOp;
    // Positive (MOVI/FMOV) encodings on the defined bits.
    if ((NewOp = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
        (NewOp = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
        (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
        (NewOp = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
        (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
        (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
      return NewOp;

    // Inverted (MVNI) encodings on the complemented bits.
    DefBits = ~DefBits;
    if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, DefBits)) ||
        (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, DefBits)) ||
        (NewOp = tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, DefBits)))
      return NewOp;

    // Retry both families treating undef bits as free to choose.
    DefBits = UndefBits;
    if ((NewOp = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
        (NewOp = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
        (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
        (NewOp = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
        (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
        (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
      return NewOp;

    DefBits = ~UndefBits;
    if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, DefBits)) ||
        (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, DefBits)) ||
        (NewOp = tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, DefBits)))
      return NewOp;
  }

  return SDValue();
}
12975
/// Lower ISD::BUILD_VECTOR for AArch64. Strategies, tried in order: SVE
/// step-vector for constant sequences, modified-immediate constants
/// (ConstantBuildVector), DUP/DUPLANE splats, UZP1/UZP2 reconstruction from
/// extracts, splat-then-insert, shuffle reconstruction, two-DUP
/// CONCAT/VECTOR_SHUFFLE, and finally a chain of INSERT_VECTOR_ELT.
SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
                                                 SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();

  // Fixed-length vectors lowered with SVE: a constant arithmetic sequence
  // (start + i*step) can be built directly from a STEP_VECTOR plus a splat.
  if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
    if (auto SeqInfo = cast<BuildVectorSDNode>(Op)->isConstantSequence()) {
      SDLoc DL(Op);
      EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
      SDValue Start = DAG.getConstant(SeqInfo->first, DL, ContainerVT);
      SDValue Steps = DAG.getStepVector(DL, ContainerVT, SeqInfo->second);
      SDValue Seq = DAG.getNode(ISD::ADD, DL, ContainerVT, Start, Steps);
      return convertFromScalableVector(DAG, Op.getValueType(), Seq);
    }

    // Revert to common legalisation for all other variants.
    return SDValue();
  }

  // Try to build a simple constant vector.
  Op = NormalizeBuildVector(Op, DAG);
  // Though this might return a non-BUILD_VECTOR (e.g. CONCAT_VECTORS), if so,
  // abort.
  if (Op.getOpcode() != ISD::BUILD_VECTOR)
    return SDValue();

  // Certain vector constants, used to express things like logical NOT and
  // arithmetic NEG, are passed through unmodified. This allows special
  // patterns for these operations to match, which will lower these constants
  // to whatever is proven necessary.
  BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
  if (BVN->isConstant()) {
    if (ConstantSDNode *Const = BVN->getConstantSplatNode()) {
      unsigned BitSize = VT.getVectorElementType().getSizeInBits();
      APInt Val(BitSize,
                Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue());
      if (Val.isZero() || (VT.isInteger() && Val.isAllOnes()))
        return Op;
    }
    if (ConstantFPSDNode *Const = BVN->getConstantFPSplatNode())
      if (Const->isZero() && !Const->isNegative())
        return Op;
  }

  // Constant vector that fits a modified-immediate encoding.
  if (SDValue V = ConstantBuildVector(Op, DAG))
    return V;

  // Scan through the operands to find some interesting properties we can
  // exploit:
  //   1) If only one value is used, we can use a DUP, or
  //   2) if only the low element is not undef, we can just insert that, or
  //   3) if only one constant value is used (w/ some non-constant lanes),
  //      we can splat the constant value into the whole vector then fill
  //      in the non-constant lanes.
  //   4) FIXME: If different constant values are used, but we can intelligently
  //             select the values we'll be overwriting for the non-constant
  //             lanes such that we can directly materialize the vector
  //             some other way (MOVI, e.g.), we can be sneaky.
  //   5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP.
  SDLoc dl(Op);
  unsigned NumElts = VT.getVectorNumElements();
  bool isOnlyLowElement = true;
  bool usesOnlyOneValue = true;
  bool usesOnlyOneConstantValue = true;
  bool isConstant = true;
  bool AllLanesExtractElt = true;
  unsigned NumConstantLanes = 0;
  unsigned NumDifferentLanes = 0;
  unsigned NumUndefLanes = 0;
  SDValue Value;
  SDValue ConstantValue;
  // Maps each distinct lane value to the length of its last consecutive run.
  SmallMapVector<SDValue, unsigned, 16> DifferentValueMap;
  unsigned ConsecutiveValCount = 0;
  SDValue PrevVal;
  for (unsigned i = 0; i < NumElts; ++i) {
    SDValue V = Op.getOperand(i);
    if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      AllLanesExtractElt = false;
    if (V.isUndef()) {
      ++NumUndefLanes;
      continue;
    }
    if (i > 0)
      isOnlyLowElement = false;
    if (!isIntOrFPConstant(V))
      isConstant = false;

    if (isIntOrFPConstant(V)) {
      ++NumConstantLanes;
      if (!ConstantValue.getNode())
        ConstantValue = V;
      else if (ConstantValue != V)
        usesOnlyOneConstantValue = false;
    }

    if (!Value.getNode())
      Value = V;
    else if (V != Value) {
      usesOnlyOneValue = false;
      ++NumDifferentLanes;
    }

    if (PrevVal != V) {
      ConsecutiveValCount = 0;
      PrevVal = V;
    }

    // Keep different values and their last consecutive count. For example,
    //
    //   t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
    //                             t24, t24, t24, t24, t24, t24, t24, t24
    //   t23 = consecutive count 8
    //   t24 = consecutive count 8
    //   ------------------------------------------------------------------
    //   t22: v16i8 = build_vector t24, t24, t23, t23, t23, t23, t23, t24,
    //                             t24, t24, t24, t24, t24, t24, t24, t24
    //   t23 = consecutive count 5
    //   t24 = consecutive count 9
    DifferentValueMap[V] = ++ConsecutiveValCount;
  }

  if (!Value.getNode()) {
    LLVM_DEBUG(
        dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
    return DAG.getUNDEF(VT);
  }

  // Convert BUILD_VECTOR where all elements but the lowest are undef into
  // SCALAR_TO_VECTOR, except for when we have a single-element constant vector
  // as SimplifyDemandedBits will just turn that back into BUILD_VECTOR.
  if (isOnlyLowElement && !(NumElts == 1 && isIntOrFPConstant(Value))) {
    LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
                         "SCALAR_TO_VECTOR node\n");
    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
  }

  if (AllLanesExtractElt) {
    SDNode *Vector = nullptr;
    bool Even = false;
    bool Odd = false;
    // Check whether the extract elements match the Even pattern <0,2,4,...> or
    // the Odd pattern <1,3,5,...>.
    for (unsigned i = 0; i < NumElts; ++i) {
      SDValue V = Op.getOperand(i);
      const SDNode *N = V.getNode();
      if (!isa<ConstantSDNode>(N->getOperand(1))) {
        Even = false;
        Odd = false;
        break;
      }
      SDValue N0 = N->getOperand(0);

      // All elements are extracted from the same vector.
      if (!Vector) {
        Vector = N0.getNode();
        // Check that the type of EXTRACT_VECTOR_ELT matches the type of
        // BUILD_VECTOR.
        // NOTE(review): the right-hand side of this comparison is missing
        // from this view (presumably N0's element type) -- confirm against
        // upstream.
        if (VT.getVectorElementType() !=
          break;
      } else if (Vector != N0.getNode()) {
        Odd = false;
        Even = false;
        break;
      }

      // Extracted values are either at Even indices <0,2,4,...> or at Odd
      // indices <1,3,5,...>.
      uint64_t Val = N->getConstantOperandVal(1);
      if (Val == 2 * i) {
        Even = true;
        continue;
      }
      if (Val - 1 == 2 * i) {
        Odd = true;
        continue;
      }

      // Something does not match: abort.
      Odd = false;
      Even = false;
      break;
    }
    if (Even || Odd) {
      // NOTE(review): the EXTRACT_SUBVECTOR calls that initialise LHS and RHS
      // are missing from this view (only their index operands remain) --
      // confirm against upstream.
      SDValue LHS =
              DAG.getConstant(0, dl, MVT::i64));
      SDValue RHS =
              DAG.getConstant(NumElts, dl, MVT::i64));

      if (Even && !Odd)
        return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), LHS,
                           RHS);
      if (Odd && !Even)
        return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), LHS,
                           RHS);
    }
  }

  // Use DUP for non-constant splats. For f32 constant splats, reduce to
  // i32 and try again.
  if (usesOnlyOneValue) {
    if (!isConstant) {
      if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
          Value.getValueType() != VT) {
        LLVM_DEBUG(
            dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
        return DAG.getNode(AArch64ISD::DUP, dl, VT, Value);
      }

      // This is actually a DUPLANExx operation, which keeps everything vectory.

      SDValue Lane = Value.getOperand(1);
      Value = Value.getOperand(0);
      if (Value.getValueSizeInBits() == 64) {
        LLVM_DEBUG(
            dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
                      "widening it\n");
        Value = WidenVector(Value, DAG);
      }

      unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
      return DAG.getNode(Opcode, dl, VT, Value, Lane);
    }

    // NOTE(review): one or two lines are missing here in this view (likely
    // the declaration of Ops used below) -- confirm against upstream.
    EVT EltTy = VT.getVectorElementType();
    assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 ||
             EltTy == MVT::f64) && "Unsupported floating-point vector type");
    LLVM_DEBUG(
        dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
                  "BITCASTS, and try again\n");
    MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits());
    for (unsigned i = 0; i < NumElts; ++i)
      Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i)));
    EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
    SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
    LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
               Val.dump(););
    Val = LowerBUILD_VECTOR(Val, DAG);
    if (Val.getNode())
      return DAG.getNode(ISD::BITCAST, dl, VT, Val);
  }

  // If we need to insert a small number of different non-constant elements and
  // the vector width is sufficiently large, prefer using DUP with the common
  // value and INSERT_VECTOR_ELT for the different lanes. If DUP is preferred,
  // skip the constant lane handling below.
  bool PreferDUPAndInsert =
      !isConstant && NumDifferentLanes >= 1 &&
      NumDifferentLanes < ((NumElts - NumUndefLanes) / 2) &&
      NumDifferentLanes >= NumConstantLanes;

  // If there was only one constant value used and for more than one lane,
  // start by splatting that value, then replace the non-constant lanes. This
  // is better than the default, which will perform a separate initialization
  // for each lane.
  if (!PreferDUPAndInsert && NumConstantLanes > 0 && usesOnlyOneConstantValue) {
    // Firstly, try to materialize the splat constant.
    SDValue Val = DAG.getSplatBuildVector(VT, dl, ConstantValue);
    unsigned BitSize = VT.getScalarSizeInBits();
    APInt ConstantValueAPInt(1, 0);
    if (auto *C = dyn_cast<ConstantSDNode>(ConstantValue))
      ConstantValueAPInt = C->getAPIntValue().zextOrTrunc(BitSize);
    // All-zeros and all-ones splats are handled by dedicated patterns; only
    // other constants need explicit materialisation here.
    if (!isNullConstant(ConstantValue) && !isNullFPConstant(ConstantValue) &&
        !ConstantValueAPInt.isAllOnes()) {
      Val = ConstantBuildVector(Val, DAG);
      if (!Val)
        // Otherwise, materialize the constant and splat it.
        Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue);
    }

    // Now insert the non-constant lanes.
    for (unsigned i = 0; i < NumElts; ++i) {
      SDValue V = Op.getOperand(i);
      SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
      if (!isIntOrFPConstant(V))
        // Note that type legalization likely mucked about with the VT of the
        // source operand, so we may have to convert it here before inserting.
        Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx);
    }
    return Val;
  }

  // This will generate a load from the constant pool.
  if (isConstant) {
    LLVM_DEBUG(
        dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
                  "expansion\n");
    return SDValue();
  }

  // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
  // v4i32s. This is really a truncate, which we can construct out of (legal)
  // concats and truncate nodes.
  // NOTE(review): the call producing M is missing from this view -- confirm
  // against upstream.
    return M;

  // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
  if (NumElts >= 4) {
    if (SDValue Shuffle = ReconstructShuffle(Op, DAG))
      return Shuffle;

    if (SDValue Shuffle = ReconstructShuffleWithRuntimeMask(Op, DAG))
      return Shuffle;
  }

  if (PreferDUPAndInsert) {
    // First, build a constant vector with the common element.
    SmallVector<SDValue, 8> Ops(NumElts, Value);
    SDValue NewVector = LowerBUILD_VECTOR(DAG.getBuildVector(VT, dl, Ops), DAG);
    // Next, insert the elements that do not match the common value.
    for (unsigned I = 0; I < NumElts; ++I)
      if (Op.getOperand(I) != Value)
        NewVector =
            DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, NewVector,
                        Op.getOperand(I), DAG.getConstant(I, dl, MVT::i64));

    return NewVector;
  }

  // If vector consists of two different values, try to generate two DUPs and
  // (CONCAT_VECTORS or VECTOR_SHUFFLE).
  if (DifferentValueMap.size() == 2 && NumUndefLanes == 0) {
    // NOTE(review): the declaration of Vals (collecting the two distinct
    // values) is missing from this view -- confirm against upstream.
    // Check the consecutive count of the value is the half number of vector
    // elements. In this case, we can use CONCAT_VECTORS. For example,
    //
    // canUseVECTOR_CONCAT = true;
    //   t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
    //                             t24, t24, t24, t24, t24, t24, t24, t24
    //
    // canUseVECTOR_CONCAT = false;
    //   t22: v16i8 = build_vector t23, t23, t23, t23, t23, t24, t24, t24,
    //                             t24, t24, t24, t24, t24, t24, t24, t24
    bool canUseVECTOR_CONCAT = true;
    for (auto Pair : DifferentValueMap) {
      // Check different values have same length which is NumElts / 2.
      if (Pair.second != NumElts / 2)
        canUseVECTOR_CONCAT = false;
      Vals.push_back(Pair.first);
    }

    // If canUseVECTOR_CONCAT is true, we can generate two DUPs and
    // CONCAT_VECTORs. For example,
    //
    //   t22: v16i8 = BUILD_VECTOR t23, t23, t23, t23, t23, t23, t23, t23,
    //                             t24, t24, t24, t24, t24, t24, t24, t24
    // ==>
    //   t26: v8i8 = AArch64ISD::DUP t23
    //   t28: v8i8 = AArch64ISD::DUP t24
    //   t29: v16i8 = concat_vectors t26, t28
    if (canUseVECTOR_CONCAT) {
      EVT SubVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
      if (isTypeLegal(SubVT) && SubVT.isVector() &&
          SubVT.getVectorNumElements() >= 2) {
        SmallVector<SDValue, 8> Ops1(NumElts / 2, Vals[0]);
        SmallVector<SDValue, 8> Ops2(NumElts / 2, Vals[1]);
        SDValue DUP1 =
            LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, dl, Ops1), DAG);
        SDValue DUP2 =
            LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, dl, Ops2), DAG);
        // NOTE(review): the declaration line for CONCAT_VECTORS is missing
        // from this view -- confirm against upstream.
            DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, DUP1, DUP2);
        return CONCAT_VECTORS;
      }
    }

    // Let's try to generate VECTOR_SHUFFLE. For example,
    //
    //   t24: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t26, t26, t26, t26
    // ==>
    //   t27: v8i8 = BUILD_VECTOR t26, t26, t26, t26, t26, t26, t26, t26
    //   t28: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t25, t25, t25, t25
    //   t29: v8i8 = vector_shuffle<0,1,2,3,12,13,14,15> t27, t28
    if (NumElts >= 8) {
      SmallVector<int, 16> MaskVec;
      // Build mask for VECTOR_SHUFFLE.
      SDValue FirstLaneVal = Op.getOperand(0);
      for (unsigned i = 0; i < NumElts; ++i) {
        SDValue Val = Op.getOperand(i);
        if (FirstLaneVal == Val)
          MaskVec.push_back(i);
        else
          MaskVec.push_back(i + NumElts);
      }

      SmallVector<SDValue, 8> Ops1(NumElts, Vals[0]);
      SmallVector<SDValue, 8> Ops2(NumElts, Vals[1]);
      SDValue VEC1 = DAG.getBuildVector(VT, dl, Ops1);
      SDValue VEC2 = DAG.getBuildVector(VT, dl, Ops2);
      // NOTE(review): the declaration line for VECTOR_SHUFFLE is missing from
      // this view -- confirm against upstream.
          DAG.getVectorShuffle(VT, dl, VEC1, VEC2, MaskVec);
      return VECTOR_SHUFFLE;
    }
  }

  // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
  // know the default expansion would otherwise fall back on something even
  // worse. For a vector with one or two non-undef values, that's
  // scalar_to_vector for the elements followed by a shuffle (provided the
  // shuffle is valid for the target) and materialization element by element
  // on the stack followed by a load for everything else.
  if (!isConstant && !usesOnlyOneValue) {
    LLVM_DEBUG(
        dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
                  "of INSERT_VECTOR_ELT\n");

    SDValue Vec = DAG.getUNDEF(VT);
    SDValue Op0 = Op.getOperand(0);
    unsigned i = 0;

    // Use SCALAR_TO_VECTOR for lane zero to
    // a) Avoid a RMW dependency on the full vector register, and
    // b) Allow the register coalescer to fold away the copy if the
    //    value is already in an S or D register, and we're forced to emit an
    //    INSERT_SUBREG that we can't fold anywhere.
    //
    // We also allow types like i8 and i16 which are illegal scalar but legal
    // vector element types. After type-legalization the inserted value is
    // extended (i32) and it is safe to cast them to the vector type by ignoring
    // the upper bits of the lowest lane (e.g. v8i8, v4i16).
    if (!Op0.isUndef()) {
      LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
      Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op0);
      ++i;
    }
    LLVM_DEBUG(if (i < NumElts) dbgs()
                   << "Creating nodes for the other vector elements:\n";);
    for (; i < NumElts; ++i) {
      SDValue V = Op.getOperand(i);
      if (V.isUndef())
        continue;
      SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
      Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
    }
    return Vec;
  }

  LLVM_DEBUG(
      dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
                "better alternative\n");
  return SDValue();
}
13422
13423SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op,
13424 SelectionDAG &DAG) const {
13425 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
13426 !Subtarget->isNeonAvailable()))
13427 return LowerFixedLengthConcatVectorsToSVE(Op, DAG);
13428
13429 assert(Op.getValueType().isScalableVector() &&
13430 isTypeLegal(Op.getValueType()) &&
13431 "Expected legal scalable vector type!");
13432
13433 if (isTypeLegal(Op.getOperand(0).getValueType())) {
13434 unsigned NumOperands = Op->getNumOperands();
13435 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
13436 "Unexpected number of operands in CONCAT_VECTORS");
13437
13438 if (NumOperands == 2)
13439 return Op;
13440
13441 // Concat each pair of subvectors and pack into the lower half of the array.
13442 SmallVector<SDValue> ConcatOps(Op->op_begin(), Op->op_end());
13443 while (ConcatOps.size() > 1) {
13444 for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
13445 SDValue V1 = ConcatOps[I];
13446 SDValue V2 = ConcatOps[I + 1];
13447 EVT SubVT = V1.getValueType();
13448 EVT PairVT = SubVT.getDoubleNumVectorElementsVT(*DAG.getContext());
13449 ConcatOps[I / 2] =
13450 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), PairVT, V1, V2);
13451 }
13452 ConcatOps.resize(ConcatOps.size() / 2);
13453 }
13454 return ConcatOps[0];
13455 }
13456
13457 return SDValue();
13458}
13459
13460SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
13461 SelectionDAG &DAG) const {
13462 assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
13463
13464 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
13465 !Subtarget->isNeonAvailable()))
13466 return LowerFixedLengthInsertVectorElt(Op, DAG);
13467
13468 EVT VT = Op.getOperand(0).getValueType();
13469
13470 if (VT.getScalarType() == MVT::i1) {
13471 EVT VectorVT = getPromotedVTForPredicate(VT);
13472 SDLoc DL(Op);
13473 SDValue ExtendedVector =
13474 DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, VectorVT);
13475 SDValue ExtendedValue =
13476 DAG.getAnyExtOrTrunc(Op.getOperand(1), DL,
13477 VectorVT.getScalarType().getSizeInBits() < 32
13478 ? MVT::i32
13479 : VectorVT.getScalarType());
13480 ExtendedVector =
13481 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VectorVT, ExtendedVector,
13482 ExtendedValue, Op.getOperand(2));
13483 return DAG.getAnyExtOrTrunc(ExtendedVector, DL, VT);
13484 }
13485
13486 // Check for non-constant or out of range lane.
13487 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2));
13488 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
13489 return SDValue();
13490
13491 return Op;
13492}
13493
13494SDValue
13495AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
13496 SelectionDAG &DAG) const {
13497 assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
13498 EVT VT = Op.getOperand(0).getValueType();
13499
13500 if (VT.getScalarType() == MVT::i1) {
13501 // We can't directly extract from an SVE predicate; extend it first.
13502 // (This isn't the only possible lowering, but it's straightforward.)
13503 EVT VectorVT = getPromotedVTForPredicate(VT);
13504 SDLoc DL(Op);
13505 SDValue Extend =
13506 DAG.getNode(ISD::ANY_EXTEND, DL, VectorVT, Op.getOperand(0));
13507 MVT ExtractTy = VectorVT == MVT::nxv2i64 ? MVT::i64 : MVT::i32;
13508 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractTy,
13509 Extend, Op.getOperand(1));
13510 return DAG.getAnyExtOrTrunc(Extract, DL, Op.getValueType());
13511 }
13512
13513 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
13514 return LowerFixedLengthExtractVectorElt(Op, DAG);
13515
13516 // Check for non-constant or out of range lane.
13517 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1));
13518 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
13519 return SDValue();
13520
13521 // Insertion/extraction are legal for V128 types.
13522 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
13523 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
13524 VT == MVT::v8f16 || VT == MVT::v8bf16)
13525 return Op;
13526
13527 if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
13528 VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
13529 VT != MVT::v4bf16)
13530 return SDValue();
13531
13532 // For V64 types, we perform extraction by expanding the value
13533 // to a V128 type and perform the extraction on that.
13534 SDLoc DL(Op);
13535 SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
13536 EVT WideTy = WideVec.getValueType();
13537
13538 EVT ExtrTy = WideTy.getVectorElementType();
13539 if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
13540 ExtrTy = MVT::i32;
13541
13542 // For extractions, we just return the result directly.
13543 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
13544 Op.getOperand(1));
13545}
13546
13547SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
13548 SelectionDAG &DAG) const {
13549 assert(Op.getValueType().isFixedLengthVector() &&
13550 "Only cases that extract a fixed length vector are supported!");
13551
13552 EVT InVT = Op.getOperand(0).getValueType();
13553 unsigned Idx = Op.getConstantOperandVal(1);
13554 unsigned Size = Op.getValueSizeInBits();
13555
13556 // If we don't have legal types yet, do nothing
13557 if (!DAG.getTargetLoweringInfo().isTypeLegal(InVT))
13558 return SDValue();
13559
13560 if (InVT.isScalableVector()) {
13561 // This will be matched by custom code during ISelDAGToDAG.
13562 if (Idx == 0 && isPackedVectorType(InVT, DAG))
13563 return Op;
13564
13565 return SDValue();
13566 }
13567
13568 // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
13569 if (Idx == 0 && InVT.getSizeInBits() <= 128)
13570 return Op;
13571
13572 // If this is extracting the upper 64-bits of a 128-bit vector, we match
13573 // that directly.
13574 if (Size == 64 && Idx * InVT.getScalarSizeInBits() == 64 &&
13575 InVT.getSizeInBits() == 128 && Subtarget->isNeonAvailable())
13576 return Op;
13577
13578 if (useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable())) {
13579 SDLoc DL(Op);
13580
13581 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
13582 SDValue NewInVec =
13583 convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
13584
13585 SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, ContainerVT, NewInVec,
13586 NewInVec, DAG.getConstant(Idx, DL, MVT::i64));
13587 return convertFromScalableVector(DAG, Op.getValueType(), Splice);
13588 }
13589
13590 return SDValue();
13591}
13592
13593SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
13594 SelectionDAG &DAG) const {
13595 assert(Op.getValueType().isScalableVector() &&
13596 "Only expect to lower inserts into scalable vectors!");
13597
13598 EVT InVT = Op.getOperand(1).getValueType();
13599 unsigned Idx = Op.getConstantOperandVal(2);
13600
13601 SDValue Vec0 = Op.getOperand(0);
13602 SDValue Vec1 = Op.getOperand(1);
13603 SDLoc DL(Op);
13604 EVT VT = Op.getValueType();
13605
13606 if (InVT.isScalableVector()) {
13607 if (!isTypeLegal(VT))
13608 return SDValue();
13609
13610 // Break down insert_subvector into simpler parts.
13611 if (VT.getVectorElementType() == MVT::i1) {
13612 unsigned NumElts = VT.getVectorMinNumElements();
13613 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
13614
13615 SDValue Lo, Hi;
13616 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
13617 DAG.getVectorIdxConstant(0, DL));
13618 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
13619 DAG.getVectorIdxConstant(NumElts / 2, DL));
13620 if (Idx < (NumElts / 2)) {
13621 SDValue NewLo = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Lo, Vec1,
13623 return DAG.getNode(AArch64ISD::UZP1, DL, VT, NewLo, Hi);
13624 } else {
13625 SDValue NewHi =
13626 DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Hi, Vec1,
13627 DAG.getVectorIdxConstant(Idx - (NumElts / 2), DL));
13628 return DAG.getNode(AArch64ISD::UZP1, DL, VT, Lo, NewHi);
13629 }
13630 }
13631
13632 // Ensure the subvector is half the size of the main vector.
13633 if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2))
13634 return SDValue();
13635
13636 // Here narrow and wide refers to the vector element types. After "casting"
13637 // both vectors must have the same bit length and so because the subvector
13638 // has fewer elements, those elements need to be bigger.
13641
13642 // NOP cast operands to the largest legal vector of the same element count.
13643 if (VT.isFloatingPoint()) {
13644 Vec0 = getSVESafeBitCast(NarrowVT, Vec0, DAG);
13645 Vec1 = getSVESafeBitCast(WideVT, Vec1, DAG);
13646 } else {
13647 // Legal integer vectors are already their largest so Vec0 is fine as is.
13648 Vec1 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1);
13649 }
13650
13651 // To replace the top/bottom half of vector V with vector SubV we widen the
13652 // preserved half of V, concatenate this to SubV (the order depending on the
13653 // half being replaced) and then narrow the result.
13654 SDValue Narrow;
13655 if (Idx == 0) {
13656 SDValue HiVec0 = DAG.getNode(AArch64ISD::UUNPKHI, DL, WideVT, Vec0);
13657 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, Vec1, HiVec0);
13658 } else {
13660 "Invalid subvector index!");
13661 SDValue LoVec0 = DAG.getNode(AArch64ISD::UUNPKLO, DL, WideVT, Vec0);
13662 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, LoVec0, Vec1);
13663 }
13664
13665 return getSVESafeBitCast(VT, Narrow, DAG);
13666 }
13667
13668 if (Idx == 0 && isPackedVectorType(VT, DAG)) {
13669 // This will be matched by custom code during ISelDAGToDAG.
13670 if (Vec0.isUndef())
13671 return Op;
13672
13673 std::optional<unsigned> PredPattern =
13675 auto PredTy = VT.changeVectorElementType(MVT::i1);
13676 SDValue PTrue = getPTrue(DAG, DL, PredTy, *PredPattern);
13677 SDValue ScalableVec1 = convertToScalableVector(DAG, VT, Vec1);
13678 return DAG.getNode(ISD::VSELECT, DL, VT, PTrue, ScalableVec1, Vec0);
13679 }
13680
13681 return SDValue();
13682}
13683
13684static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated) {
13685 if (Op.getOpcode() != AArch64ISD::DUP &&
13686 Op.getOpcode() != ISD::SPLAT_VECTOR &&
13687 Op.getOpcode() != ISD::BUILD_VECTOR)
13688 return false;
13689
13690 if (Op.getOpcode() == ISD::BUILD_VECTOR &&
13691 !isAllConstantBuildVector(Op, SplatVal))
13692 return false;
13693
13694 if (Op.getOpcode() != ISD::BUILD_VECTOR &&
13695 !isa<ConstantSDNode>(Op->getOperand(0)))
13696 return false;
13697
13698 SplatVal = Op->getConstantOperandVal(0);
13699 if (Op.getValueType().getVectorElementType() != MVT::i64)
13700 SplatVal = (int32_t)SplatVal;
13701
13702 Negated = false;
13703 if (isPowerOf2_64(SplatVal))
13704 return true;
13705
13706 Negated = true;
13707 if (isPowerOf2_64(-SplatVal)) {
13708 SplatVal = -SplatVal;
13709 return true;
13710 }
13711
13712 return false;
13713}
13714
13715SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
13716 EVT VT = Op.getValueType();
13717 SDLoc dl(Op);
13718
13719 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
13720 return LowerFixedLengthVectorIntDivideToSVE(Op, DAG);
13721
13722 assert(VT.isScalableVector() && "Expected a scalable vector.");
13723
13724 bool Signed = Op.getOpcode() == ISD::SDIV;
13725 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
13726
13727 bool Negated;
13728 uint64_t SplatVal;
13729 if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
13730 SDValue Pg = getPredicateForScalableVector(DAG, dl, VT);
13731 SDValue Res =
13732 DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, dl, VT, Pg, Op->getOperand(0),
13733 DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32));
13734 if (Negated)
13735 Res = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), Res);
13736
13737 return Res;
13738 }
13739
13740 if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64)
13741 return LowerToPredicatedOp(Op, DAG, PredOpcode);
13742
13743 // SVE doesn't have i8 and i16 DIV operations; widen them to 32-bit
13744 // operations, and truncate the result.
13745 EVT WidenedVT;
13746 if (VT == MVT::nxv16i8)
13747 WidenedVT = MVT::nxv8i16;
13748 else if (VT == MVT::nxv8i16)
13749 WidenedVT = MVT::nxv4i32;
13750 else
13751 llvm_unreachable("Unexpected Custom DIV operation");
13752
13753 unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
13754 unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI;
13755 SDValue Op0Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(0));
13756 SDValue Op1Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(1));
13757 SDValue Op0Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(0));
13758 SDValue Op1Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(1));
13759 SDValue ResultLo = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Lo, Op1Lo);
13760 SDValue ResultHi = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Hi, Op1Hi);
13761 return DAG.getNode(AArch64ISD::UZP1, dl, VT, ResultLo, ResultHi);
13762}
13763
13765 // Currently no fixed length shuffles that require SVE are legal.
13766 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
13767 return false;
13768
13769 if (VT.getVectorNumElements() == 4 &&
13770 (VT.is128BitVector() || VT.is64BitVector())) {
13771 unsigned Cost = getPerfectShuffleCost(M);
13772 if (Cost <= 1)
13773 return true;
13774 }
13775
13776 bool DummyBool;
13777 int DummyInt;
13778 unsigned DummyUnsigned;
13779
13780 return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) ||
13781 isREVMask(M, VT, 32) || isREVMask(M, VT, 16) ||
13782 isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
13783 // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM.
13784 isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) ||
13785 isZIPMask(M, VT, DummyUnsigned) ||
13786 isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
13787 isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
13788 isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
13789 isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt) ||
13790 isConcatMask(M, VT, VT.getSizeInBits() == 128));
13791}
13792
13794 EVT VT) const {
13795 // Just delegate to the generic legality, clear masks aren't special.
13796 return isShuffleMaskLegal(M, VT);
13797}
13798
13799/// getVShiftImm - Check if this is a valid build_vector for the immediate
13800/// operand of a vector shift operation, where all the elements of the
13801/// build_vector must have the same constant integer value.
13802static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
13803 // Ignore bit_converts.
13804 while (Op.getOpcode() == ISD::BITCAST)
13805 Op = Op.getOperand(0);
13806 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
13807 APInt SplatBits, SplatUndef;
13808 unsigned SplatBitSize;
13809 bool HasAnyUndefs;
13810 if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
13811 HasAnyUndefs, ElementBits) ||
13812 SplatBitSize > ElementBits)
13813 return false;
13814 Cnt = SplatBits.getSExtValue();
13815 return true;
13816}
13817
13818/// isVShiftLImm - Check if this is a valid build_vector for the immediate
13819/// operand of a vector shift left operation. That value must be in the range:
13820/// 0 <= Value < ElementBits for a left shift; or
13821/// 0 <= Value <= ElementBits for a long left shift.
13822static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
13823 assert(VT.isVector() && "vector shift count is not a vector type");
13824 int64_t ElementBits = VT.getScalarSizeInBits();
13825 if (!getVShiftImm(Op, ElementBits, Cnt))
13826 return false;
13827 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
13828}
13829
/// isVShiftRImm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift right operation. The value must be in the range:
///   1 <= Value <= ElementBits for a right shift; or
///   1 <= Value <= ElementBits / 2 for a narrowing right shift.
static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
  assert(VT.isVector() && "vector shift count is not a vector type");
  int64_t ElementBits = VT.getScalarSizeInBits();
  if (!getVShiftImm(Op, ElementBits, Cnt))
    return false;
  return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
}
13840
13841SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
13842 SelectionDAG &DAG) const {
13843 EVT VT = Op.getValueType();
13844
13845 if (VT.getScalarType() == MVT::i1) {
13846 // Lower i1 truncate to `(x & 1) != 0`.
13847 SDLoc dl(Op);
13848 EVT OpVT = Op.getOperand(0).getValueType();
13849 SDValue Zero = DAG.getConstant(0, dl, OpVT);
13850 SDValue One = DAG.getConstant(1, dl, OpVT);
13851 SDValue And = DAG.getNode(ISD::AND, dl, OpVT, Op.getOperand(0), One);
13852 return DAG.getSetCC(dl, VT, And, Zero, ISD::SETNE);
13853 }
13854
13855 if (!VT.isVector() || VT.isScalableVector())
13856 return SDValue();
13857
13858 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
13859 !Subtarget->isNeonAvailable()))
13860 return LowerFixedLengthVectorTruncateToSVE(Op, DAG);
13861
13862 return SDValue();
13863}
13864
13865SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
13866 SelectionDAG &DAG) const {
13867 EVT VT = Op.getValueType();
13868 SDLoc DL(Op);
13869 int64_t Cnt;
13870
13871 if (!Op.getOperand(1).getValueType().isVector())
13872 return Op;
13873 unsigned EltSize = VT.getScalarSizeInBits();
13874
13875 switch (Op.getOpcode()) {
13876 case ISD::SHL:
13877 if (VT.isScalableVector() ||
13879 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_PRED);
13880
13881 if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
13882 return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
13883 DAG.getConstant(Cnt, DL, MVT::i32));
13884 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
13885 DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL,
13886 MVT::i32),
13887 Op.getOperand(0), Op.getOperand(1));
13888 case ISD::SRA:
13889 case ISD::SRL:
13890 if (VT.isScalableVector() ||
13891 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
13892 unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_PRED
13894 return LowerToPredicatedOp(Op, DAG, Opc);
13895 }
13896
13897 // Right shift immediate
13898 if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) {
13899 unsigned Opc =
13900 (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
13901 return DAG.getNode(Opc, DL, VT, Op.getOperand(0),
13902 DAG.getConstant(Cnt, DL, MVT::i32));
13903 }
13904
13905 // Right shift register. Note, there is not a shift right register
13906 // instruction, but the shift left register instruction takes a signed
13907 // value, where negative numbers specify a right shift.
13908 unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
13909 : Intrinsic::aarch64_neon_ushl;
13910 // negate the shift amount
13911 SDValue NegShift = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
13912 Op.getOperand(1));
13913 SDValue NegShiftLeft =
13915 DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0),
13916 NegShift);
13917 return NegShiftLeft;
13918 }
13919
13920 llvm_unreachable("unexpected shift opcode");
13921}
13922
13924 AArch64CC::CondCode CC, bool NoNans, EVT VT,
13925 const SDLoc &dl, SelectionDAG &DAG) {
13926 EVT SrcVT = LHS.getValueType();
13927 assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
13928 "function only supposed to emit natural comparisons");
13929
13930 APInt SplatValue;
13931 APInt SplatUndef;
13932 unsigned SplatBitSize = 0;
13933 bool HasAnyUndefs;
13934
13935 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
13936 bool IsCnst = BVN && BVN->isConstantSplat(SplatValue, SplatUndef,
13937 SplatBitSize, HasAnyUndefs);
13938
13939 bool IsZero = IsCnst && SplatValue == 0;
13940 bool IsOne =
13941 IsCnst && SrcVT.getScalarSizeInBits() == SplatBitSize && SplatValue == 1;
13942 bool IsMinusOne = IsCnst && SplatValue.isAllOnes();
13943
13944 if (SrcVT.getVectorElementType().isFloatingPoint()) {
13945 switch (CC) {
13946 default:
13947 return SDValue();
13948 case AArch64CC::NE: {
13949 SDValue Fcmeq;
13950 if (IsZero)
13951 Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
13952 else
13953 Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
13954 return DAG.getNOT(dl, Fcmeq, VT);
13955 }
13956 case AArch64CC::EQ:
13957 if (IsZero)
13958 return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
13959 return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
13960 case AArch64CC::GE:
13961 if (IsZero)
13962 return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS);
13963 return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS);
13964 case AArch64CC::GT:
13965 if (IsZero)
13966 return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS);
13967 return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
13968 case AArch64CC::LE:
13969 if (!NoNans)
13970 return SDValue();
13971 // If we ignore NaNs then we can use to the LS implementation.
13972 [[fallthrough]];
13973 case AArch64CC::LS:
13974 if (IsZero)
13975 return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS);
13976 return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS);
13977 case AArch64CC::LT:
13978 if (!NoNans)
13979 return SDValue();
13980 // If we ignore NaNs then we can use to the MI implementation.
13981 [[fallthrough]];
13982 case AArch64CC::MI:
13983 if (IsZero)
13984 return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS);
13985 return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS);
13986 }
13987 }
13988
13989 switch (CC) {
13990 default:
13991 return SDValue();
13992 case AArch64CC::NE: {
13993 SDValue Cmeq;
13994 if (IsZero)
13995 Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
13996 else
13997 Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
13998 return DAG.getNOT(dl, Cmeq, VT);
13999 }
14000 case AArch64CC::EQ:
14001 if (IsZero)
14002 return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
14003 return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
14004 case AArch64CC::GE:
14005 if (IsZero)
14006 return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS);
14007 return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS);
14008 case AArch64CC::GT:
14009 if (IsZero)
14010 return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS);
14011 if (IsMinusOne)
14012 return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS, RHS);
14013 return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS);
14014 case AArch64CC::LE:
14015 if (IsZero)
14016 return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
14017 return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS);
14018 case AArch64CC::LS:
14019 return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS);
14020 case AArch64CC::LO:
14021 return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS);
14022 case AArch64CC::LT:
14023 if (IsZero)
14024 return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS);
14025 if (IsOne)
14026 return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
14027 return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS);
14028 case AArch64CC::HI:
14029 return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS);
14030 case AArch64CC::HS:
14031 return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS);
14032 }
14033}
14034
14035SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
14036 SelectionDAG &DAG) const {
14037 if (Op.getValueType().isScalableVector())
14038 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO);
14039
14040 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
14041 !Subtarget->isNeonAvailable()))
14042 return LowerFixedLengthVectorSetccToSVE(Op, DAG);
14043
14044 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
14045 SDValue LHS = Op.getOperand(0);
14046 SDValue RHS = Op.getOperand(1);
14047 EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
14048 SDLoc dl(Op);
14049
14050 if (LHS.getValueType().getVectorElementType().isInteger()) {
14051 assert(LHS.getValueType() == RHS.getValueType());
14053 SDValue Cmp =
14054 EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG);
14055 return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
14056 }
14057
14058 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
14059
14060 // Make v4f16 (only) fcmp operations utilise vector instructions
14061 // v8f16 support will be a litle more complicated
14062 if (!FullFP16 && LHS.getValueType().getVectorElementType() == MVT::f16) {
14063 if (LHS.getValueType().getVectorNumElements() == 4) {
14064 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, LHS);
14065 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, RHS);
14066 SDValue NewSetcc = DAG.getSetCC(dl, MVT::v4i16, LHS, RHS, CC);
14067 DAG.ReplaceAllUsesWith(Op, NewSetcc);
14068 CmpVT = MVT::v4i32;
14069 } else
14070 return SDValue();
14071 }
14072
14073 assert((!FullFP16 && LHS.getValueType().getVectorElementType() != MVT::f16) ||
14074 LHS.getValueType().getVectorElementType() != MVT::f128);
14075
14076 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
14077 // clean. Some of them require two branches to implement.
14078 AArch64CC::CondCode CC1, CC2;
14079 bool ShouldInvert;
14080 changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
14081
14082 bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs();
14083 SDValue Cmp =
14084 EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG);
14085 if (!Cmp.getNode())
14086 return SDValue();
14087
14088 if (CC2 != AArch64CC::AL) {
14089 SDValue Cmp2 =
14090 EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG);
14091 if (!Cmp2.getNode())
14092 return SDValue();
14093
14094 Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2);
14095 }
14096
14097 Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
14098
14099 if (ShouldInvert)
14100 Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType());
14101
14102 return Cmp;
14103}
14104
14105static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
14106 SelectionDAG &DAG) {
14107 SDValue VecOp = ScalarOp.getOperand(0);
14108 auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp);
14109 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx,
14110 DAG.getConstant(0, DL, MVT::i64));
14111}
14112
/// Lower a fixed-length VECREDUCE_{AND,OR,XOR} of Vec to a scalar of type VT.
///
/// i1 vectors are extended to a legal element size and mapped onto
/// UMINV/UMAXV/ADDV; wider vectors are repeatedly folded in half with the
/// bitwise op until they fit a 64-bit register, after which the remaining
/// lanes are combined with scalar shift+op pairs.
static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT,
                                      SDLoc DL, SelectionDAG &DAG) {
  // Map the reduction opcode to its element-wise binary opcode.
  unsigned ScalarOpcode;
  switch (Opcode) {
  case ISD::VECREDUCE_AND:
    ScalarOpcode = ISD::AND;
    break;
  case ISD::VECREDUCE_OR:
    ScalarOpcode = ISD::OR;
    break;
  case ISD::VECREDUCE_XOR:
    ScalarOpcode = ISD::XOR;
    break;
  default:
    llvm_unreachable("Expected bitwise vector reduction");
    return SDValue();
  }

  EVT VecVT = Vec.getValueType();
  assert(VecVT.isFixedLengthVector() && VecVT.isPow2VectorType() &&
         "Expected power-of-2 length vector");

  EVT ElemVT = VecVT.getVectorElementType();

  SDValue Result;
  unsigned NumElems = VecVT.getVectorNumElements();

  // Special case for boolean reductions
  if (ElemVT == MVT::i1) {
    // Split large vectors into smaller ones (recursing on the half-width
    // combination until at most 16 elements remain).
    if (NumElems > 16) {
      SDValue Lo, Hi;
      std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
      EVT HalfVT = Lo.getValueType();
      SDValue HalfVec = DAG.getNode(ScalarOpcode, DL, HalfVT, Lo, Hi);
      return getVectorBitwiseReduce(Opcode, HalfVec, VT, DL, DAG);
    }

    // Vectors that are less than 64 bits get widened to neatly fit a 64 bit
    // register, so e.g. <4 x i1> gets lowered to <4 x i16>. Sign extending to
    // this element size leads to the best codegen, since e.g. setcc results
    // might need to be truncated otherwise.
    EVT ExtendedVT = MVT::getIntegerVT(std::max(64u / NumElems, 8u));

    // any_ext doesn't work with umin/umax, so only use it for uadd.
    unsigned ExtendOp =
        ScalarOpcode == ISD::XOR ? ISD::ANY_EXTEND : ISD::SIGN_EXTEND;
    SDValue Extended = DAG.getNode(
        ExtendOp, DL, VecVT.changeVectorElementType(ExtendedVT), Vec);
    // AND == all-lanes-set == UMIN != 0; OR == any-lane-set == UMAX != 0;
    // XOR == parity == low bit of ADD.
    switch (ScalarOpcode) {
    case ISD::AND:
      Result = DAG.getNode(ISD::VECREDUCE_UMIN, DL, ExtendedVT, Extended);
      break;
    case ISD::OR:
      Result = DAG.getNode(ISD::VECREDUCE_UMAX, DL, ExtendedVT, Extended);
      break;
    case ISD::XOR:
      Result = DAG.getNode(ISD::VECREDUCE_ADD, DL, ExtendedVT, Extended);
      break;
    default:
      llvm_unreachable("Unexpected Opcode");
    }

    Result = DAG.getAnyExtOrTrunc(Result, DL, MVT::i1);
  } else {
    // Iteratively split the vector in half and combine using the bitwise
    // operation until it fits in a 64 bit register.
    while (VecVT.getSizeInBits() > 64) {
      SDValue Lo, Hi;
      std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
      VecVT = Lo.getValueType();
      NumElems = VecVT.getVectorNumElements();
      Vec = DAG.getNode(ScalarOpcode, DL, VecVT, Lo, Hi);
    }

    EVT ScalarVT = EVT::getIntegerVT(*DAG.getContext(), VecVT.getSizeInBits());

    // Do the remaining work on a scalar since it allows the code generator to
    // combine the shift and bitwise operation into one instruction and since
    // integer instructions can have higher throughput than vector instructions.
    SDValue Scalar = DAG.getBitcast(ScalarVT, Vec);

    // Iteratively combine the lower and upper halves of the scalar using the
    // bitwise operation, halving the relevant region of the scalar in each
    // iteration, until the relevant region is just one element of the original
    // vector.
    for (unsigned Shift = NumElems / 2; Shift > 0; Shift /= 2) {
      SDValue ShiftAmount =
          DAG.getConstant(Shift * ElemVT.getSizeInBits(), DL, MVT::i64);
      SDValue Shifted =
          DAG.getNode(ISD::SRL, DL, ScalarVT, Scalar, ShiftAmount);
      Scalar = DAG.getNode(ScalarOpcode, DL, ScalarVT, Scalar, Shifted);
    }

    Result = DAG.getAnyExtOrTrunc(Scalar, DL, ElemVT);
  }

  return DAG.getAnyExtOrTrunc(Result, DL, VT);
}
14212
14213SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
14214 SelectionDAG &DAG) const {
14215 SDValue Src = Op.getOperand(0);
14216
14217 // Try to lower fixed length reductions to SVE.
14218 EVT SrcVT = Src.getValueType();
14219 bool OverrideNEON = !Subtarget->isNeonAvailable() ||
14220 Op.getOpcode() == ISD::VECREDUCE_AND ||
14221 Op.getOpcode() == ISD::VECREDUCE_OR ||
14222 Op.getOpcode() == ISD::VECREDUCE_XOR ||
14223 Op.getOpcode() == ISD::VECREDUCE_FADD ||
14224 (Op.getOpcode() != ISD::VECREDUCE_ADD &&
14225 SrcVT.getVectorElementType() == MVT::i64);
14226 if (SrcVT.isScalableVector() ||
14228 SrcVT, OverrideNEON && Subtarget->useSVEForFixedLengthVectors())) {
14229
14230 if (SrcVT.getVectorElementType() == MVT::i1)
14231 return LowerPredReductionToSVE(Op, DAG);
14232
14233 switch (Op.getOpcode()) {
14234 case ISD::VECREDUCE_ADD:
14235 return LowerReductionToSVE(AArch64ISD::UADDV_PRED, Op, DAG);
14236 case ISD::VECREDUCE_AND:
14237 return LowerReductionToSVE(AArch64ISD::ANDV_PRED, Op, DAG);
14238 case ISD::VECREDUCE_OR:
14239 return LowerReductionToSVE(AArch64ISD::ORV_PRED, Op, DAG);
14241 return LowerReductionToSVE(AArch64ISD::SMAXV_PRED, Op, DAG);
14243 return LowerReductionToSVE(AArch64ISD::SMINV_PRED, Op, DAG);
14245 return LowerReductionToSVE(AArch64ISD::UMAXV_PRED, Op, DAG);
14247 return LowerReductionToSVE(AArch64ISD::UMINV_PRED, Op, DAG);
14248 case ISD::VECREDUCE_XOR:
14249 return LowerReductionToSVE(AArch64ISD::EORV_PRED, Op, DAG);
14251 return LowerReductionToSVE(AArch64ISD::FADDV_PRED, Op, DAG);
14253 return LowerReductionToSVE(AArch64ISD::FMAXNMV_PRED, Op, DAG);
14255 return LowerReductionToSVE(AArch64ISD::FMINNMV_PRED, Op, DAG);
14257 return LowerReductionToSVE(AArch64ISD::FMAXV_PRED, Op, DAG);
14259 return LowerReductionToSVE(AArch64ISD::FMINV_PRED, Op, DAG);
14260 default:
14261 llvm_unreachable("Unhandled fixed length reduction");
14262 }
14263 }
14264
14265 // Lower NEON reductions.
14266 SDLoc dl(Op);
14267 switch (Op.getOpcode()) {
14268 case ISD::VECREDUCE_AND:
14269 case ISD::VECREDUCE_OR:
14270 case ISD::VECREDUCE_XOR:
14271 return getVectorBitwiseReduce(Op.getOpcode(), Op.getOperand(0),
14272 Op.getValueType(), dl, DAG);
14273 case ISD::VECREDUCE_ADD:
14274 return getReductionSDNode(AArch64ISD::UADDV, dl, Op, DAG);
14276 return getReductionSDNode(AArch64ISD::SMAXV, dl, Op, DAG);
14278 return getReductionSDNode(AArch64ISD::SMINV, dl, Op, DAG);
14280 return getReductionSDNode(AArch64ISD::UMAXV, dl, Op, DAG);
14282 return getReductionSDNode(AArch64ISD::UMINV, dl, Op, DAG);
14283 default:
14284 llvm_unreachable("Unhandled reduction");
14285 }
14286}
14287
14288SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
14289 SelectionDAG &DAG) const {
14290 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
14291 // No point replacing if we don't have the relevant instruction/libcall anyway
14292 if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
14293 return SDValue();
14294
14295 // LSE has an atomic load-clear instruction, but not a load-and.
14296 SDLoc dl(Op);
14297 MVT VT = Op.getSimpleValueType();
14298 assert(VT != MVT::i128 && "Handled elsewhere, code replicated.");
14299 SDValue RHS = Op.getOperand(2);
14300 AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
14301 RHS = DAG.getNode(ISD::XOR, dl, VT, DAG.getConstant(-1ULL, dl, VT), RHS);
14302 return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, dl, AN->getMemoryVT(),
14303 Op.getOperand(0), Op.getOperand(1), RHS,
14304 AN->getMemOperand());
14305}
14306
14307SDValue
14308AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op,
14309 SelectionDAG &DAG) const {
14310
14311 SDLoc dl(Op);
14312 // Get the inputs.
14313 SDNode *Node = Op.getNode();
14314 SDValue Chain = Op.getOperand(0);
14315 SDValue Size = Op.getOperand(1);
14317 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
14318 EVT VT = Node->getValueType(0);
14319
14321 "no-stack-arg-probe")) {
14322 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
14323 Chain = SP.getValue(1);
14324 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
14325 if (Align)
14326 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
14327 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
14328 Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
14329 SDValue Ops[2] = {SP, Chain};
14330 return DAG.getMergeValues(Ops, dl);
14331 }
14332
14333 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
14334
14335 EVT PtrVT = getPointerTy(DAG.getDataLayout());
14337 PtrVT, 0);
14338
14339 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
14340 const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask();
14341 if (Subtarget->hasCustomCallingConv())
14342 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
14343
14344 Size = DAG.getNode(ISD::SRL, dl, MVT::i64, Size,
14345 DAG.getConstant(4, dl, MVT::i64));
14346 Chain = DAG.getCopyToReg(Chain, dl, AArch64::X15, Size, SDValue());
14347 Chain =
14348 DAG.getNode(AArch64ISD::CALL, dl, DAG.getVTList(MVT::Other, MVT::Glue),
14349 Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64),
14350 DAG.getRegisterMask(Mask), Chain.getValue(1));
14351 // To match the actual intent better, we should read the output from X15 here
14352 // again (instead of potentially spilling it to the stack), but rereading Size
14353 // from X15 here doesn't work at -O0, since it thinks that X15 is undefined
14354 // here.
14355
14356 Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size,
14357 DAG.getConstant(4, dl, MVT::i64));
14358
14359 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
14360 Chain = SP.getValue(1);
14361 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
14362 if (Align)
14363 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
14364 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
14365 Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
14366
14367 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
14368
14369 SDValue Ops[2] = {SP, Chain};
14370 return DAG.getMergeValues(Ops, dl);
14371}
14372
14373SDValue
14374AArch64TargetLowering::LowerInlineDYNAMIC_STACKALLOC(SDValue Op,
14375 SelectionDAG &DAG) const {
14376 // Get the inputs.
14377 SDNode *Node = Op.getNode();
14378 SDValue Chain = Op.getOperand(0);
14379 SDValue Size = Op.getOperand(1);
14380
14382 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
14383 SDLoc dl(Op);
14384 EVT VT = Node->getValueType(0);
14385
14386 // Construct the new SP value in a GPR.
14387 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
14388 Chain = SP.getValue(1);
14389 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
14390 if (Align)
14391 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
14392 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
14393
14394 // Set the real SP to the new value with a probing loop.
14395 Chain = DAG.getNode(AArch64ISD::PROBED_ALLOCA, dl, MVT::Other, Chain, SP);
14396 SDValue Ops[2] = {SP, Chain};
14397 return DAG.getMergeValues(Ops, dl);
14398}
14399
14400SDValue
14401AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
14402 SelectionDAG &DAG) const {
14404
14405 if (Subtarget->isTargetWindows())
14406 return LowerWindowsDYNAMIC_STACKALLOC(Op, DAG);
14407 else if (hasInlineStackProbe(MF))
14408 return LowerInlineDYNAMIC_STACKALLOC(Op, DAG);
14409 else
14410 return SDValue();
14411}
14412
// When x and y are extended, lower:
//   avgfloor(x, y) -> (x + y) >> 1
//   avgceil(x, y)  -> (x + y + 1) >> 1
//
// Otherwise, lower to:
//   avgfloor(x, y) -> (x >> 1) + (y >> 1) + (x & y & 1)
//   avgceil(x, y)  -> (x >> 1) + (y >> 1) + ((x | y) & 1)
14420SDValue AArch64TargetLowering::LowerAVG(SDValue Op, SelectionDAG &DAG,
14421 unsigned NewOp) const {
14422 if (Subtarget->hasSVE2())
14423 return LowerToPredicatedOp(Op, DAG, NewOp);
14424
14425 SDLoc dl(Op);
14426 SDValue OpA = Op->getOperand(0);
14427 SDValue OpB = Op->getOperand(1);
14428 EVT VT = Op.getValueType();
14429 bool IsCeil =
14430 (Op->getOpcode() == ISD::AVGCEILS || Op->getOpcode() == ISD::AVGCEILU);
14431 bool IsSigned =
14432 (Op->getOpcode() == ISD::AVGFLOORS || Op->getOpcode() == ISD::AVGCEILS);
14433 unsigned ShiftOpc = IsSigned ? ISD::SRA : ISD::SRL;
14434
14435 assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
14436
14437 auto IsZeroExtended = [&DAG](SDValue &Node) {
14438 KnownBits Known = DAG.computeKnownBits(Node, 0);
14439 return Known.Zero.isSignBitSet();
14440 };
14441
14442 auto IsSignExtended = [&DAG](SDValue &Node) {
14443 return (DAG.ComputeNumSignBits(Node, 0) > 1);
14444 };
14445
14446 SDValue ConstantOne = DAG.getConstant(1, dl, VT);
14447 if ((!IsSigned && IsZeroExtended(OpA) && IsZeroExtended(OpB)) ||
14448 (IsSigned && IsSignExtended(OpA) && IsSignExtended(OpB))) {
14449 SDValue Add = DAG.getNode(ISD::ADD, dl, VT, OpA, OpB);
14450 if (IsCeil)
14451 Add = DAG.getNode(ISD::ADD, dl, VT, Add, ConstantOne);
14452 return DAG.getNode(ShiftOpc, dl, VT, Add, ConstantOne);
14453 }
14454
14455 SDValue ShiftOpA = DAG.getNode(ShiftOpc, dl, VT, OpA, ConstantOne);
14456 SDValue ShiftOpB = DAG.getNode(ShiftOpc, dl, VT, OpB, ConstantOne);
14457
14458 SDValue tmp = DAG.getNode(IsCeil ? ISD::OR : ISD::AND, dl, VT, OpA, OpB);
14459 tmp = DAG.getNode(ISD::AND, dl, VT, tmp, ConstantOne);
14460 SDValue Add = DAG.getNode(ISD::ADD, dl, VT, ShiftOpA, ShiftOpB);
14461 return DAG.getNode(ISD::ADD, dl, VT, Add, tmp);
14462}
14463
14464SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
14465 SelectionDAG &DAG) const {
14466 EVT VT = Op.getValueType();
14467 assert(VT != MVT::i64 && "Expected illegal VSCALE node");
14468
14469 SDLoc DL(Op);
14470 APInt MulImm = Op.getConstantOperandAPInt(0);
14471 return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sext(64)), DL,
14472 VT);
14473}
14474
14475/// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics.
14476template <unsigned NumVecs>
14477static bool
14481 // Retrieve EC from first vector argument.
14482 const EVT VT = TLI.getMemValueType(DL, CI.getArgOperand(0)->getType());
14484#ifndef NDEBUG
14485 // Check the assumption that all input vectors are the same type.
14486 for (unsigned I = 0; I < NumVecs; ++I)
14487 assert(VT == TLI.getMemValueType(DL, CI.getArgOperand(I)->getType()) &&
14488 "Invalid type.");
14489#endif
14490 // memVT is `NumVecs * VT`.
14492 EC * NumVecs);
14493 Info.ptrVal = CI.getArgOperand(CI.arg_size() - 1);
14494 Info.offset = 0;
14495 Info.align.reset();
14497 return true;
14498}
14499
14500/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
14501/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
14502/// specified in the intrinsic calls.
14504 const CallInst &I,
14505 MachineFunction &MF,
14506 unsigned Intrinsic) const {
14507 auto &DL = I.getModule()->getDataLayout();
14508 switch (Intrinsic) {
14509 case Intrinsic::aarch64_sve_st2:
14510 return setInfoSVEStN<2>(*this, DL, Info, I);
14511 case Intrinsic::aarch64_sve_st3:
14512 return setInfoSVEStN<3>(*this, DL, Info, I);
14513 case Intrinsic::aarch64_sve_st4:
14514 return setInfoSVEStN<4>(*this, DL, Info, I);
14515 case Intrinsic::aarch64_neon_ld2:
14516 case Intrinsic::aarch64_neon_ld3:
14517 case Intrinsic::aarch64_neon_ld4:
14518 case Intrinsic::aarch64_neon_ld1x2:
14519 case Intrinsic::aarch64_neon_ld1x3:
14520 case Intrinsic::aarch64_neon_ld1x4: {
14522 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
14523 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
14524 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
14525 Info.offset = 0;
14526 Info.align.reset();
14527 // volatile loads with NEON intrinsics not supported
14529 return true;
14530 }
14531 case Intrinsic::aarch64_neon_ld2lane:
14532 case Intrinsic::aarch64_neon_ld3lane:
14533 case Intrinsic::aarch64_neon_ld4lane:
14534 case Intrinsic::aarch64_neon_ld2r:
14535 case Intrinsic::aarch64_neon_ld3r:
14536 case Intrinsic::aarch64_neon_ld4r: {
14538 // ldx return struct with the same vec type
14539 Type *RetTy = I.getType();
14540 auto *StructTy = cast<StructType>(RetTy);
14541 unsigned NumElts = StructTy->getNumElements();
14542 Type *VecTy = StructTy->getElementType(0);
14543 MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
14544 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
14545 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
14546 Info.offset = 0;
14547 Info.align.reset();
14548 // volatile loads with NEON intrinsics not supported
14550 return true;
14551 }
14552 case Intrinsic::aarch64_neon_st2:
14553 case Intrinsic::aarch64_neon_st3:
14554 case Intrinsic::aarch64_neon_st4:
14555 case Intrinsic::aarch64_neon_st1x2:
14556 case Intrinsic::aarch64_neon_st1x3:
14557 case Intrinsic::aarch64_neon_st1x4: {
14559 unsigned NumElts = 0;
14560 for (const Value *Arg : I.args()) {
14561 Type *ArgTy = Arg->getType();
14562 if (!ArgTy->isVectorTy())
14563 break;
14564 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
14565 }
14566 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
14567 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
14568 Info.offset = 0;
14569 Info.align.reset();
14570 // volatile stores with NEON intrinsics not supported
14572 return true;
14573 }
14574 case Intrinsic::aarch64_neon_st2lane:
14575 case Intrinsic::aarch64_neon_st3lane:
14576 case Intrinsic::aarch64_neon_st4lane: {
14578 unsigned NumElts = 0;
14579 // all the vector type is same
14580 Type *VecTy = I.getArgOperand(0)->getType();
14581 MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
14582
14583 for (const Value *Arg : I.args()) {
14584 Type *ArgTy = Arg->getType();
14585 if (!ArgTy->isVectorTy())
14586 break;
14587 NumElts += 1;
14588 }
14589
14590 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
14591 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
14592 Info.offset = 0;
14593 Info.align.reset();
14594 // volatile stores with NEON intrinsics not supported
14596 return true;
14597 }
14598 case Intrinsic::aarch64_ldaxr:
14599 case Intrinsic::aarch64_ldxr: {
14600 Type *ValTy = I.getParamElementType(0);
14602 Info.memVT = MVT::getVT(ValTy);
14603 Info.ptrVal = I.getArgOperand(0);
14604 Info.offset = 0;
14605 Info.align = DL.getABITypeAlign(ValTy);
14607 return true;
14608 }
14609 case Intrinsic::aarch64_stlxr:
14610 case Intrinsic::aarch64_stxr: {
14611 Type *ValTy = I.getParamElementType(1);
14613 Info.memVT = MVT::getVT(ValTy);
14614 Info.ptrVal = I.getArgOperand(1);
14615 Info.offset = 0;
14616 Info.align = DL.getABITypeAlign(ValTy);
14618 return true;
14619 }
14620 case Intrinsic::aarch64_ldaxp:
14621 case Intrinsic::aarch64_ldxp:
14623 Info.memVT = MVT::i128;
14624 Info.ptrVal = I.getArgOperand(0);
14625 Info.offset = 0;
14626 Info.align = Align(16);
14628 return true;
14629 case Intrinsic::aarch64_stlxp:
14630 case Intrinsic::aarch64_stxp:
14632 Info.memVT = MVT::i128;
14633 Info.ptrVal = I.getArgOperand(2);
14634 Info.offset = 0;
14635 Info.align = Align(16);
14637 return true;
14638 case Intrinsic::aarch64_sve_ldnt1: {
14639 Type *ElTy = cast<VectorType>(I.getType())->getElementType();
14641 Info.memVT = MVT::getVT(I.getType());
14642 Info.ptrVal = I.getArgOperand(1);
14643 Info.offset = 0;
14644 Info.align = DL.getABITypeAlign(ElTy);
14646 return true;
14647 }
14648 case Intrinsic::aarch64_sve_stnt1: {
14649 Type *ElTy =
14650 cast<VectorType>(I.getArgOperand(0)->getType())->getElementType();
14652 Info.memVT = MVT::getVT(I.getOperand(0)->getType());
14653 Info.ptrVal = I.getArgOperand(2);
14654 Info.offset = 0;
14655 Info.align = DL.getABITypeAlign(ElTy);
14657 return true;
14658 }
14659 case Intrinsic::aarch64_mops_memset_tag: {
14660 Value *Dst = I.getArgOperand(0);
14661 Value *Val = I.getArgOperand(1);
14663 Info.memVT = MVT::getVT(Val->getType());
14664 Info.ptrVal = Dst;
14665 Info.offset = 0;
14666 Info.align = I.getParamAlign(0).valueOrOne();
14668 // The size of the memory being operated on is unknown at this point
14670 return true;
14671 }
14672 default:
14673 break;
14674 }
14675
14676 return false;
14677}
14678
14680 ISD::LoadExtType ExtTy,
14681 EVT NewVT) const {
14682 // TODO: This may be worth removing. Check regression tests for diffs.
14683 if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT))
14684 return false;
14685
14686 // If we're reducing the load width in order to avoid having to use an extra
14687 // instruction to do extension then it's probably a good idea.
14688 if (ExtTy != ISD::NON_EXTLOAD)
14689 return true;
14690 // Don't reduce load width if it would prevent us from combining a shift into
14691 // the offset.
14692 MemSDNode *Mem = dyn_cast<MemSDNode>(Load);
14693 assert(Mem);
14694 const SDValue &Base = Mem->getBasePtr();
14695 if (Base.getOpcode() == ISD::ADD &&
14696 Base.getOperand(1).getOpcode() == ISD::SHL &&
14697 Base.getOperand(1).hasOneUse() &&
14698 Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) {
14699 // It's unknown whether a scalable vector has a power-of-2 bitwidth.
14700 if (Mem->getMemoryVT().isScalableVector())
14701 return false;
14702 // The shift can be combined if it matches the size of the value being
14703 // loaded (and so reducing the width would make it not match).
14704 uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1);
14705 uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8;
14706 if (ShiftAmount == Log2_32(LoadBytes))
14707 return false;
14708 }
14709 // We have no reason to disallow reducing the load width, so allow it.
14710 return true;
14711}
14712
14713// Treat a sext_inreg(extract(..)) as free if it has multiple uses.
14715 EVT VT = Extend.getValueType();
14716 if ((VT == MVT::i64 || VT == MVT::i32) && Extend->use_size()) {
14717 SDValue Extract = Extend.getOperand(0);
14718 if (Extract.getOpcode() == ISD::ANY_EXTEND && Extract.hasOneUse())
14719 Extract = Extract.getOperand(0);
14720 if (Extract.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Extract.hasOneUse()) {
14721 EVT VecVT = Extract.getOperand(0).getValueType();
14722 if (VecVT.getScalarType() == MVT::i8 || VecVT.getScalarType() == MVT::i16)
14723 return false;
14724 }
14725 }
14726 return true;
14727}
14728
14729// Truncations from 64-bit GPR to 32-bit GPR is free.
14731 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
14732 return false;
14733 uint64_t NumBits1 = Ty1->getPrimitiveSizeInBits().getFixedValue();
14734 uint64_t NumBits2 = Ty2->getPrimitiveSizeInBits().getFixedValue();
14735 return NumBits1 > NumBits2;
14736}
14738 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
14739 return false;
14740 uint64_t NumBits1 = VT1.getFixedSizeInBits();
14741 uint64_t NumBits2 = VT2.getFixedSizeInBits();
14742 return NumBits1 > NumBits2;
14743}
14744
14745/// Check if it is profitable to hoist instruction in then/else to if.
14746/// Not profitable if I and it's user can form a FMA instruction
14747/// because we prefer FMSUB/FMADD.
14749 if (I->getOpcode() != Instruction::FMul)
14750 return true;
14751
14752 if (!I->hasOneUse())
14753 return true;
14754
14755 Instruction *User = I->user_back();
14756
14757 if (!(User->getOpcode() == Instruction::FSub ||
14758 User->getOpcode() == Instruction::FAdd))
14759 return true;
14760
14762 const Function *F = I->getFunction();
14763 const DataLayout &DL = F->getParent()->getDataLayout();
14764 Type *Ty = User->getOperand(0)->getType();
14765
14766 return !(isFMAFasterThanFMulAndFAdd(*F, Ty) &&
14768 (Options.AllowFPOpFusion == FPOpFusion::Fast ||
14769 Options.UnsafeFPMath));
14770}
14771
14772// All 32-bit GPR operations implicitly zero the high-half of the corresponding
14773// 64-bit GPR.
14775 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
14776 return false;
14777 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
14778 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
14779 return NumBits1 == 32 && NumBits2 == 64;
14780}
14782 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
14783 return false;
14784 unsigned NumBits1 = VT1.getSizeInBits();
14785 unsigned NumBits2 = VT2.getSizeInBits();
14786 return NumBits1 == 32 && NumBits2 == 64;
14787}
14788
14790 EVT VT1 = Val.getValueType();
14791 if (isZExtFree(VT1, VT2)) {
14792 return true;
14793 }
14794
14795 if (Val.getOpcode() != ISD::LOAD)
14796 return false;
14797
14798 // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
14799 return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
14800 VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
14801 VT1.getSizeInBits() <= 32);
14802}
14803
14804bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
14805 if (isa<FPExtInst>(Ext))
14806 return false;
14807
14808 // Vector types are not free.
14809 if (Ext->getType()->isVectorTy())
14810 return false;
14811
14812 for (const Use &U : Ext->uses()) {
14813 // The extension is free if we can fold it with a left shift in an
14814 // addressing mode or an arithmetic operation: add, sub, and cmp.
14815
14816 // Is there a shift?
14817 const Instruction *Instr = cast<Instruction>(U.getUser());
14818
14819 // Is this a constant shift?
14820 switch (Instr->getOpcode()) {
14821 case Instruction::Shl:
14822 if (!isa<ConstantInt>(Instr->getOperand(1)))
14823 return false;
14824 break;
14825 case Instruction::GetElementPtr: {
14826 gep_type_iterator GTI = gep_type_begin(Instr);
14827 auto &DL = Ext->getModule()->getDataLayout();
14828 std::advance(GTI, U.getOperandNo()-1);
14829 Type *IdxTy = GTI.getIndexedType();
14830 // This extension will end up with a shift because of the scaling factor.
14831 // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
14832 // Get the shift amount based on the scaling factor:
14833 // log2(sizeof(IdxTy)) - log2(8).
14834 if (IdxTy->isScalableTy())
14835 return false;
14836 uint64_t ShiftAmt =
14837 llvm::countr_zero(DL.getTypeStoreSizeInBits(IdxTy).getFixedValue()) -
14838 3;
14839 // Is the constant foldable in the shift of the addressing mode?
14840 // I.e., shift amount is between 1 and 4 inclusive.
14841 if (ShiftAmt == 0 || ShiftAmt > 4)
14842 return false;
14843 break;
14844 }
14845 case Instruction::Trunc:
14846 // Check if this is a noop.
14847 // trunc(sext ty1 to ty2) to ty1.
14848 if (Instr->getType() == Ext->getOperand(0)->getType())
14849 continue;
14850 [[fallthrough]];
14851 default:
14852 return false;
14853 }
14854
14855 // At this point we can use the bfm family, so this extension is free
14856 // for that use.
14857 }
14858 return true;
14859}
14860
14861static bool isSplatShuffle(Value *V) {
14862 if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V))
14863 return all_equal(Shuf->getShuffleMask());
14864 return false;
14865}
14866
/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
/// or upper half of the vector elements.
static bool areExtractShuffleVectors(Value *Op1, Value *Op2,
                                     bool AllowSplat = false) {
  // True when FullV's type is exactly twice the bit width of HalfV's.
  auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
    auto *FullTy = FullV->getType();
    auto *HalfTy = HalfV->getType();
    return FullTy->getPrimitiveSizeInBits().getFixedValue() ==
           2 * HalfTy->getPrimitiveSizeInBits().getFixedValue();
  };

  // True when FullV has exactly twice as many elements as HalfV.
  auto extractHalf = [](Value *FullV, Value *HalfV) {
    auto *FullVT = cast<FixedVectorType>(FullV->getType());
    auto *HalfVT = cast<FixedVectorType>(HalfV->getType());
    return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
  };

  // Both operands must be single-input shuffles (second input undef).
  ArrayRef<int> M1, M2;
  Value *S1Op1 = nullptr, *S2Op1 = nullptr;
  if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) ||
      !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2))))
    return false;

  // If we allow splats, set S1Op1/S2Op1 to nullptr for the relevant arg so that
  // it is not checked as an extract below.
  if (AllowSplat && isSplatShuffle(Op1))
    S1Op1 = nullptr;
  if (AllowSplat && isSplatShuffle(Op2))
    S2Op1 = nullptr;

  // Check that the operands are half as wide as the result and we extract
  // half of the elements of the input vectors.
  if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) ||
      (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2))))
    return false;

  // Check the mask extracts either the lower or upper half of vector
  // elements.
  int M1Start = 0;
  int M2Start = 0;
  int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2;
  if ((S1Op1 &&
       !ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start)) ||
      (S2Op1 &&
       !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start)))
    return false;

  // Each extract must start at element 0 (low half) or the midpoint (high
  // half), and when both operands are extracts they must take the same half.
  if ((M1Start != 0 && M1Start != (NumElements / 2)) ||
      (M2Start != 0 && M2Start != (NumElements / 2)))
    return false;
  if (S1Op1 && S2Op1 && M1Start != M2Start)
    return false;

  return true;
}
14922
14923/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
14924/// of the vector elements.
14925static bool areExtractExts(Value *Ext1, Value *Ext2) {
14926 auto areExtDoubled = [](Instruction *Ext) {
14927 return Ext->getType()->getScalarSizeInBits() ==
14928 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
14929 };
14930
14931 if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
14932 !match(Ext2, m_ZExtOrSExt(m_Value())) ||
14933 !areExtDoubled(cast<Instruction>(Ext1)) ||
14934 !areExtDoubled(cast<Instruction>(Ext2)))
14935 return false;
14936
14937 return true;
14938}
14939
14940/// Check if Op could be used with vmull_high_p64 intrinsic.
14942 Value *VectorOperand = nullptr;
14943 ConstantInt *ElementIndex = nullptr;
14944 return match(Op, m_ExtractElt(m_Value(VectorOperand),
14945 m_ConstantInt(ElementIndex))) &&
14946 ElementIndex->getValue() == 1 &&
14947 isa<FixedVectorType>(VectorOperand->getType()) &&
14948 cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2;
14949}
14950
14951/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
14952static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
14954}
14955
14957 // Restrict ourselves to the form CodeGenPrepare typically constructs.
14958 auto *GEP = dyn_cast<GetElementPtrInst>(Ptrs);
14959 if (!GEP || GEP->getNumOperands() != 2)
14960 return false;
14961
14962 Value *Base = GEP->getOperand(0);
14963 Value *Offsets = GEP->getOperand(1);
14964
14965 // We only care about scalar_base+vector_offsets.
14966 if (Base->getType()->isVectorTy() || !Offsets->getType()->isVectorTy())
14967 return false;
14968
14969 // Sink extends that would allow us to use 32-bit offset vectors.
14970 if (isa<SExtInst>(Offsets) || isa<ZExtInst>(Offsets)) {
14971 auto *OffsetsInst = cast<Instruction>(Offsets);
14972 if (OffsetsInst->getType()->getScalarSizeInBits() > 32 &&
14973 OffsetsInst->getOperand(0)->getType()->getScalarSizeInBits() <= 32)
14974 Ops.push_back(&GEP->getOperandUse(1));
14975 }
14976
14977 // Sink the GEP.
14978 return true;
14979}
14980
14981/// We want to sink following cases:
14982/// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale
14984 if (match(Op, m_VScale()))
14985 return true;
14986 if (match(Op, m_Shl(m_VScale(), m_ConstantInt())) ||
14988 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
14989 return true;
14990 }
14991 return false;
14992}
14993
14994/// Check if sinking \p I's operands to I's basic block is profitable, because
14995/// the operands can be folded into a target instruction, e.g.
14996/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
14998 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
14999 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
15000 switch (II->getIntrinsicID()) {
15001 case Intrinsic::aarch64_neon_smull:
15002 case Intrinsic::aarch64_neon_umull:
15003 if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1),
15004 /*AllowSplat=*/true)) {
15005 Ops.push_back(&II->getOperandUse(0));
15006 Ops.push_back(&II->getOperandUse(1));
15007 return true;
15008 }
15009 [[fallthrough]];
15010
15011 case Intrinsic::fma:
15012 if (isa<VectorType>(I->getType()) &&
15013 cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
15014 !Subtarget->hasFullFP16())
15015 return false;
15016 [[fallthrough]];
15017 case Intrinsic::aarch64_neon_sqdmull:
15018 case Intrinsic::aarch64_neon_sqdmulh:
15019 case Intrinsic::aarch64_neon_sqrdmulh:
15020 // Sink splats for index lane variants
15021 if (isSplatShuffle(II->getOperand(0)))
15022 Ops.push_back(&II->getOperandUse(0));
15023 if (isSplatShuffle(II->getOperand(1)))
15024 Ops.push_back(&II->getOperandUse(1));
15025 return !Ops.empty();
15026 case Intrinsic::aarch64_neon_fmlal:
15027 case Intrinsic::aarch64_neon_fmlal2:
15028 case Intrinsic::aarch64_neon_fmlsl:
15029 case Intrinsic::aarch64_neon_fmlsl2:
15030 // Sink splats for index lane variants
15031 if (isSplatShuffle(II->getOperand(1)))
15032 Ops.push_back(&II->getOperandUse(1));
15033 if (isSplatShuffle(II->getOperand(2)))
15034 Ops.push_back(&II->getOperandUse(2));
15035 return !Ops.empty();
15036 case Intrinsic::aarch64_sve_ptest_first:
15037 case Intrinsic::aarch64_sve_ptest_last:
15038 if (auto *IIOp = dyn_cast<IntrinsicInst>(II->getOperand(0)))
15039 if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue)
15040 Ops.push_back(&II->getOperandUse(0));
15041 return !Ops.empty();
15042 case Intrinsic::aarch64_sme_write_horiz:
15043 case Intrinsic::aarch64_sme_write_vert:
15044 case Intrinsic::aarch64_sme_writeq_horiz:
15045 case Intrinsic::aarch64_sme_writeq_vert: {
15046 auto *Idx = dyn_cast<Instruction>(II->getOperand(1));
15047 if (!Idx || Idx->getOpcode() != Instruction::Add)
15048 return false;
15049 Ops.push_back(&II->getOperandUse(1));
15050 return true;
15051 }
15052 case Intrinsic::aarch64_sme_read_horiz:
15053 case Intrinsic::aarch64_sme_read_vert:
15054 case Intrinsic::aarch64_sme_readq_horiz:
15055 case Intrinsic::aarch64_sme_readq_vert:
15056 case Intrinsic::aarch64_sme_ld1b_vert:
15057 case Intrinsic::aarch64_sme_ld1h_vert:
15058 case Intrinsic::aarch64_sme_ld1w_vert:
15059 case Intrinsic::aarch64_sme_ld1d_vert:
15060 case Intrinsic::aarch64_sme_ld1q_vert:
15061 case Intrinsic::aarch64_sme_st1b_vert:
15062 case Intrinsic::aarch64_sme_st1h_vert:
15063 case Intrinsic::aarch64_sme_st1w_vert:
15064 case Intrinsic::aarch64_sme_st1d_vert:
15065 case Intrinsic::aarch64_sme_st1q_vert:
15066 case Intrinsic::aarch64_sme_ld1b_horiz:
15067 case Intrinsic::aarch64_sme_ld1h_horiz:
15068 case Intrinsic::aarch64_sme_ld1w_horiz:
15069 case Intrinsic::aarch64_sme_ld1d_horiz:
15070 case Intrinsic::aarch64_sme_ld1q_horiz:
15071 case Intrinsic::aarch64_sme_st1b_horiz:
15072 case Intrinsic::aarch64_sme_st1h_horiz:
15073 case Intrinsic::aarch64_sme_st1w_horiz:
15074 case Intrinsic::aarch64_sme_st1d_horiz:
15075 case Intrinsic::aarch64_sme_st1q_horiz: {
15076 auto *Idx = dyn_cast<Instruction>(II->getOperand(3));
15077 if (!Idx || Idx->getOpcode() != Instruction::Add)
15078 return false;
15079 Ops.push_back(&II->getOperandUse(3));
15080 return true;
15081 }
15082 case Intrinsic::aarch64_neon_pmull:
15083 if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
15084 return false;
15085 Ops.push_back(&II->getOperandUse(0));
15086 Ops.push_back(&II->getOperandUse(1));
15087 return true;
15088 case Intrinsic::aarch64_neon_pmull64:
15089 if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
15090 II->getArgOperand(1)))
15091 return false;
15092 Ops.push_back(&II->getArgOperandUse(0));
15093 Ops.push_back(&II->getArgOperandUse(1));
15094 return true;
15095 case Intrinsic::masked_gather:
15096 if (!shouldSinkVectorOfPtrs(II->getArgOperand(0), Ops))
15097 return false;
15098 Ops.push_back(&II->getArgOperandUse(0));
15099 return true;
15100 case Intrinsic::masked_scatter:
15101 if (!shouldSinkVectorOfPtrs(II->getArgOperand(1), Ops))
15102 return false;
15103 Ops.push_back(&II->getArgOperandUse(1));
15104 return true;
15105 default:
15106 return false;
15107 }
15108 }
15109
15110 // Sink vscales closer to uses for better isel
15111 switch (I->getOpcode()) {
15112 case Instruction::GetElementPtr:
15113 case Instruction::Add:
15114 case Instruction::Sub:
15115 for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
15116 if (shouldSinkVScale(I->getOperand(Op), Ops)) {
15117 Ops.push_back(&I->getOperandUse(Op));
15118 return true;
15119 }
15120 }
15121 break;
15122 default:
15123 break;
15124 }
15125
15126 if (!I->getType()->isVectorTy())
15127 return false;
15128
15129 switch (I->getOpcode()) {
15130 case Instruction::Sub:
15131 case Instruction::Add: {
15132 if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
15133 return false;
15134
15135 // If the exts' operands extract either the lower or upper elements, we
15136 // can sink them too.
15137 auto Ext1 = cast<Instruction>(I->getOperand(0));
15138 auto Ext2 = cast<Instruction>(I->getOperand(1));
15139 if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) {
15140 Ops.push_back(&Ext1->getOperandUse(0));
15141 Ops.push_back(&Ext2->getOperandUse(0));
15142 }
15143
15144 Ops.push_back(&I->getOperandUse(0));
15145 Ops.push_back(&I->getOperandUse(1));
15146
15147 return true;
15148 }
15149 case Instruction::Or: {
15150 // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) ->
15151 // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1)
15152 if (Subtarget->hasNEON()) {
15153 Instruction *OtherAnd, *IA, *IB;
15154 Value *MaskValue;
15155 // MainAnd refers to And instruction that has 'Not' as one of its operands
15156 if (match(I, m_c_Or(m_OneUse(m_Instruction(OtherAnd)),
15157 m_OneUse(m_c_And(m_OneUse(m_Not(m_Value(MaskValue))),
15158 m_Instruction(IA)))))) {
15159 if (match(OtherAnd,
15160 m_c_And(m_Specific(MaskValue), m_Instruction(IB)))) {
15161 Instruction *MainAnd = I->getOperand(0) == OtherAnd
15162 ? cast<Instruction>(I->getOperand(1))
15163 : cast<Instruction>(I->getOperand(0));
15164
15165 // Both Ands should be in same basic block as Or
15166 if (I->getParent() != MainAnd->getParent() ||
15167 I->getParent() != OtherAnd->getParent())
15168 return false;
15169
15170 // Non-mask operands of both Ands should also be in same basic block
15171 if (I->getParent() != IA->getParent() ||
15172 I->getParent() != IB->getParent())
15173 return false;
15174
15175 Ops.push_back(&MainAnd->getOperandUse(MainAnd->getOperand(0) == IA ? 1 : 0));
15176 Ops.push_back(&I->getOperandUse(0));
15177 Ops.push_back(&I->getOperandUse(1));
15178
15179 return true;
15180 }
15181 }
15182 }
15183
15184 return false;
15185 }
15186 case Instruction::Mul: {
15187 int NumZExts = 0, NumSExts = 0;
15188 for (auto &Op : I->operands()) {
15189 // Make sure we are not already sinking this operand
15190 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
15191 continue;
15192
15193 if (match(&Op, m_SExt(m_Value()))) {
15194 NumSExts++;
15195 continue;
15196 } else if (match(&Op, m_ZExt(m_Value()))) {
15197 NumZExts++;
15198 continue;
15199 }
15200
15201 ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Op);
15202
15203 // If the Shuffle is a splat and the operand is a zext/sext, sinking the
15204 // operand and the s/zext can help create indexed s/umull. This is
15205 // especially useful to prevent i64 mul being scalarized.
15206 if (Shuffle && isSplatShuffle(Shuffle) &&
15207 match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) {
15208 Ops.push_back(&Shuffle->getOperandUse(0));
15209 Ops.push_back(&Op);
15210 if (match(Shuffle->getOperand(0), m_SExt(m_Value())))
15211 NumSExts++;
15212 else
15213 NumZExts++;
15214 continue;
15215 }
15216
15217 if (!Shuffle)
15218 continue;
15219
15220 Value *ShuffleOperand = Shuffle->getOperand(0);
15221 InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand);
15222 if (!Insert)
15223 continue;
15224
15225 Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1));
15226 if (!OperandInstr)
15227 continue;
15228
15229 ConstantInt *ElementConstant =
15230 dyn_cast<ConstantInt>(Insert->getOperand(2));
15231 // Check that the insertelement is inserting into element 0
15232 if (!ElementConstant || !ElementConstant->isZero())
15233 continue;
15234
15235 unsigned Opcode = OperandInstr->getOpcode();
15236 if (Opcode == Instruction::SExt)
15237 NumSExts++;
15238 else if (Opcode == Instruction::ZExt)
15239 NumZExts++;
15240 else {
15241 // If we find that the top bits are known 0, then we can sink and allow
15242 // the backend to generate a umull.
15243 unsigned Bitwidth = I->getType()->getScalarSizeInBits();
15244 APInt UpperMask = APInt::getHighBitsSet(Bitwidth, Bitwidth / 2);
15245 const DataLayout &DL = I->getFunction()->getParent()->getDataLayout();
15246 if (!MaskedValueIsZero(OperandInstr, UpperMask, DL))
15247 continue;
15248 NumZExts++;
15249 }
15250
15251 Ops.push_back(&Shuffle->getOperandUse(0));
15252 Ops.push_back(&Op);
15253 }
15254
15255 // Is it profitable to sink if we found two of the same type of extends.
15256 return !Ops.empty() && (NumSExts == 2 || NumZExts == 2);
15257 }
15258 default:
15259 return false;
15260 }
15261 return false;
15262}
15263
15265 bool IsLittleEndian) {
15266 Value *Op = ZExt->getOperand(0);
15267 auto *SrcTy = cast<FixedVectorType>(Op->getType());
15268 auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
15269 auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
15270 if (DstWidth % 8 != 0 || DstWidth <= 16 || DstWidth >= 64)
15271 return false;
15272
15273 assert(DstWidth % SrcWidth == 0 &&
15274 "TBL lowering is not supported for a ZExt instruction with this "
15275 "source & destination element type.");
15276 unsigned ZExtFactor = DstWidth / SrcWidth;
15277 unsigned NumElts = SrcTy->getNumElements();
15278 IRBuilder<> Builder(ZExt);
15279 SmallVector<int> Mask;
15280 // Create a mask that selects <0,...,Op[i]> for each lane of the destination
15281 // vector to replace the original ZExt. This can later be lowered to a set of
15282 // tbl instructions.
15283 for (unsigned i = 0; i < NumElts * ZExtFactor; i++) {
15284 if (IsLittleEndian) {
15285 if (i % ZExtFactor == 0)
15286 Mask.push_back(i / ZExtFactor);
15287 else
15288 Mask.push_back(NumElts);
15289 } else {
15290 if ((i + 1) % ZExtFactor == 0)
15291 Mask.push_back((i - ZExtFactor + 1) / ZExtFactor);
15292 else
15293 Mask.push_back(NumElts);
15294 }
15295 }
15296
15297 auto *FirstEltZero = Builder.CreateInsertElement(
15298 PoisonValue::get(SrcTy), Builder.getInt8(0), uint64_t(0));
15299 Value *Result = Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
15300 Result = Builder.CreateBitCast(Result, DstTy);
15301 if (DstTy != ZExt->getType())
15302 Result = Builder.CreateZExt(Result, ZExt->getType());
15303 ZExt->replaceAllUsesWith(Result);
15304 ZExt->eraseFromParent();
15305 return true;
15306}
15307
15308static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) {
15309 IRBuilder<> Builder(TI);
15311 int NumElements = cast<FixedVectorType>(TI->getType())->getNumElements();
15312 auto *SrcTy = cast<FixedVectorType>(TI->getOperand(0)->getType());
15313 auto *DstTy = cast<FixedVectorType>(TI->getType());
15314 assert(SrcTy->getElementType()->isIntegerTy() &&
15315 "Non-integer type source vector element is not supported");
15316 assert(DstTy->getElementType()->isIntegerTy(8) &&
15317 "Unsupported destination vector element type");
15318 unsigned SrcElemTySz =
15319 cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
15320 unsigned DstElemTySz =
15321 cast<IntegerType>(DstTy->getElementType())->getBitWidth();
15322 assert((SrcElemTySz % DstElemTySz == 0) &&
15323 "Cannot lower truncate to tbl instructions for a source element size "
15324 "that is not divisible by the destination element size");
15325 unsigned TruncFactor = SrcElemTySz / DstElemTySz;
15326 assert((SrcElemTySz == 16 || SrcElemTySz == 32 || SrcElemTySz == 64) &&
15327 "Unsupported source vector element type size");
15328 Type *VecTy = FixedVectorType::get(Builder.getInt8Ty(), 16);
15329
15330 // Create a mask to choose every nth byte from the source vector table of
15331 // bytes to create the truncated destination vector, where 'n' is the truncate
15332 // ratio. For example, for a truncate from Yxi64 to Yxi8, choose
15333 // 0,8,16,..Y*8th bytes for the little-endian format
15335 for (int Itr = 0; Itr < 16; Itr++) {
15336 if (Itr < NumElements)
15337 MaskConst.push_back(Builder.getInt8(
15338 IsLittleEndian ? Itr * TruncFactor
15339 : Itr * TruncFactor + (TruncFactor - 1)));
15340 else
15341 MaskConst.push_back(Builder.getInt8(255));
15342 }
15343
15344 int MaxTblSz = 128 * 4;
15345 int MaxSrcSz = SrcElemTySz * NumElements;
15346 int ElemsPerTbl =
15347 (MaxTblSz > MaxSrcSz) ? NumElements : (MaxTblSz / SrcElemTySz);
15348 assert(ElemsPerTbl <= 16 &&
15349 "Maximum elements selected using TBL instruction cannot exceed 16!");
15350
15351 int ShuffleCount = 128 / SrcElemTySz;
15352 SmallVector<int> ShuffleLanes;
15353 for (int i = 0; i < ShuffleCount; ++i)
15354 ShuffleLanes.push_back(i);
15355
15356 // Create TBL's table of bytes in 1,2,3 or 4 FP/SIMD registers using shuffles
15357 // over the source vector. If TBL's maximum 4 FP/SIMD registers are saturated,
15358 // call TBL & save the result in a vector of TBL results for combining later.
15360 while (ShuffleLanes.back() < NumElements) {
15361 Parts.push_back(Builder.CreateBitCast(
15362 Builder.CreateShuffleVector(TI->getOperand(0), ShuffleLanes), VecTy));
15363
15364 if (Parts.size() == 4) {
15366 Intrinsic::aarch64_neon_tbl4, VecTy);
15367 Parts.push_back(ConstantVector::get(MaskConst));
15368 Results.push_back(Builder.CreateCall(F, Parts));
15369 Parts.clear();
15370 }
15371
15372 for (int i = 0; i < ShuffleCount; ++i)
15373 ShuffleLanes[i] += ShuffleCount;
15374 }
15375
15376 assert((Parts.empty() || Results.empty()) &&
15377 "Lowering trunc for vectors requiring different TBL instructions is "
15378 "not supported!");
15379 // Call TBL for the residual table bytes present in 1,2, or 3 FP/SIMD
15380 // registers
15381 if (!Parts.empty()) {
15382 Intrinsic::ID TblID;
15383 switch (Parts.size()) {
15384 case 1:
15385 TblID = Intrinsic::aarch64_neon_tbl1;
15386 break;
15387 case 2:
15388 TblID = Intrinsic::aarch64_neon_tbl2;
15389 break;
15390 case 3:
15391 TblID = Intrinsic::aarch64_neon_tbl3;
15392 break;
15393 }
15394
15395 auto *F = Intrinsic::getDeclaration(TI->getModule(), TblID, VecTy);
15396 Parts.push_back(ConstantVector::get(MaskConst));
15397 Results.push_back(Builder.CreateCall(F, Parts));
15398 }
15399
15400 // Extract the destination vector from TBL result(s) after combining them
15401 // where applicable. Currently, at most two TBLs are supported.
15402 assert(Results.size() <= 2 && "Trunc lowering does not support generation of "
15403 "more than 2 tbl instructions!");
15404 Value *FinalResult = Results[0];
15405 if (Results.size() == 1) {
15406 if (ElemsPerTbl < 16) {
15407 SmallVector<int> FinalMask(ElemsPerTbl);
15408 std::iota(FinalMask.begin(), FinalMask.end(), 0);
15409 FinalResult = Builder.CreateShuffleVector(Results[0], FinalMask);
15410 }
15411 } else {
15412 SmallVector<int> FinalMask(ElemsPerTbl * Results.size());
15413 if (ElemsPerTbl < 16) {
15414 std::iota(FinalMask.begin(), FinalMask.begin() + ElemsPerTbl, 0);
15415 std::iota(FinalMask.begin() + ElemsPerTbl, FinalMask.end(), 16);
15416 } else {
15417 std::iota(FinalMask.begin(), FinalMask.end(), 0);
15418 }
15419 FinalResult =
15420 Builder.CreateShuffleVector(Results[0], Results[1], FinalMask);
15421 }
15422
15423 TI->replaceAllUsesWith(FinalResult);
15424 TI->eraseFromParent();
15425}
15426
15428 Instruction *I, Loop *L, const TargetTransformInfo &TTI) const {
15429 // shuffle_vector instructions are serialized when targeting SVE,
15430 // see LowerSPLAT_VECTOR. This peephole is not beneficial.
15431 if (!EnableExtToTBL || Subtarget->useSVEForFixedLengthVectors())
15432 return false;
15433
15434 // Try to optimize conversions using tbl. This requires materializing constant
15435 // index vectors, which can increase code size and add loads. Skip the
15436 // transform unless the conversion is in a loop block guaranteed to execute
15437 // and we are not optimizing for size.
15438 Function *F = I->getParent()->getParent();
15439 if (!L || L->getHeader() != I->getParent() || F->hasMinSize() ||
15440 F->hasOptSize())
15441 return false;
15442
15443 auto *SrcTy = dyn_cast<FixedVectorType>(I->getOperand(0)->getType());
15444 auto *DstTy = dyn_cast<FixedVectorType>(I->getType());
15445 if (!SrcTy || !DstTy)
15446 return false;
15447
15448 // Convert 'zext <Y x i8> %x to <Y x i8X>' to a shuffle that can be
15449 // lowered to tbl instructions to insert the original i8 elements
15450 // into i8x lanes. This is enabled for cases where it is beneficial.
15451 auto *ZExt = dyn_cast<ZExtInst>(I);
15452 if (ZExt && SrcTy->getElementType()->isIntegerTy(8)) {
15453 auto DstWidth = DstTy->getElementType()->getScalarSizeInBits();
15454 if (DstWidth % 8 != 0)
15455 return false;
15456
15457 auto *TruncDstType =
15458 cast<FixedVectorType>(VectorType::getTruncatedElementVectorType(DstTy));
15459 // If the ZExt can be lowered to a single ZExt to the next power-of-2 and
15460 // the remaining ZExt folded into the user, don't use tbl lowering.
15461 auto SrcWidth = SrcTy->getElementType()->getScalarSizeInBits();
15462 if (TTI.getCastInstrCost(I->getOpcode(), DstTy, TruncDstType,
15465 if (SrcWidth * 2 >= TruncDstType->getElementType()->getScalarSizeInBits())
15466 return false;
15467
15468 DstTy = TruncDstType;
15469 }
15470
15471 return createTblShuffleForZExt(ZExt, DstTy, Subtarget->isLittleEndian());
15472 }
15473
15474 auto *UIToFP = dyn_cast<UIToFPInst>(I);
15475 if (UIToFP && SrcTy->getElementType()->isIntegerTy(8) &&
15476 DstTy->getElementType()->isFloatTy()) {
15477 IRBuilder<> Builder(I);
15478 auto *ZExt = cast<ZExtInst>(
15479 Builder.CreateZExt(I->getOperand(0), VectorType::getInteger(DstTy)));
15480 auto *UI = Builder.CreateUIToFP(ZExt, DstTy);
15481 I->replaceAllUsesWith(UI);
15482 I->eraseFromParent();
15483 return createTblShuffleForZExt(ZExt, cast<FixedVectorType>(ZExt->getType()),
15484 Subtarget->isLittleEndian());
15485 }
15486
15487 // Convert 'fptoui <(8|16) x float> to <(8|16) x i8>' to a wide fptoui
15488 // followed by a truncate lowered to using tbl.4.
15489 auto *FPToUI = dyn_cast<FPToUIInst>(I);
15490 if (FPToUI &&
15491 (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) &&
15492 SrcTy->getElementType()->isFloatTy() &&
15493 DstTy->getElementType()->isIntegerTy(8)) {
15494 IRBuilder<> Builder(I);
15495 auto *WideConv = Builder.CreateFPToUI(FPToUI->getOperand(0),
15496 VectorType::getInteger(SrcTy));
15497 auto *TruncI = Builder.CreateTrunc(WideConv, DstTy);
15498 I->replaceAllUsesWith(TruncI);
15499 I->eraseFromParent();
15500 createTblForTrunc(cast<TruncInst>(TruncI), Subtarget->isLittleEndian());
15501 return true;
15502 }
15503
15504 // Convert 'trunc <(8|16) x (i32|i64)> %x to <(8|16) x i8>' to an appropriate
15505 // tbl instruction selecting the lowest/highest (little/big endian) 8 bits
15506 // per lane of the input that is represented using 1,2,3 or 4 128-bit table
15507 // registers
15508 auto *TI = dyn_cast<TruncInst>(I);
15509 if (TI && DstTy->getElementType()->isIntegerTy(8) &&
15510 ((SrcTy->getElementType()->isIntegerTy(32) ||
15511 SrcTy->getElementType()->isIntegerTy(64)) &&
15512 (SrcTy->getNumElements() == 16 || SrcTy->getNumElements() == 8))) {
15513 createTblForTrunc(TI, Subtarget->isLittleEndian());
15514 return true;
15515 }
15516
15517 return false;
15518}
15519
15521 Align &RequiredAligment) const {
15522 if (!LoadedType.isSimple() ||
15523 (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
15524 return false;
15525 // Cyclone supports unaligned accesses.
15526 RequiredAligment = Align(1);
15527 unsigned NumBits = LoadedType.getSizeInBits();
15528 return NumBits == 32 || NumBits == 64;
15529}
15530
15531/// A helper function for determining the number of interleaved accesses we
15532/// will generate when lowering accesses of the given type.
15534 VectorType *VecTy, const DataLayout &DL, bool UseScalable) const {
15535 unsigned VecSize = 128;
15536 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
15537 unsigned MinElts = VecTy->getElementCount().getKnownMinValue();
15538 if (UseScalable)
15539 VecSize = std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
15540 return std::max<unsigned>(1, (MinElts * ElSize + 127) / VecSize);
15541}
15542
15545 if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
15546 I.getMetadata(FALKOR_STRIDED_ACCESS_MD) != nullptr)
15547 return MOStridedAccess;
15549}
15550
15552 VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const {
15553 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
15554 auto EC = VecTy->getElementCount();
15555 unsigned MinElts = EC.getKnownMinValue();
15556
15557 UseScalable = false;
15558
15559 if (!VecTy->isScalableTy() && !Subtarget->hasNEON())
15560 return false;
15561
15562 if (VecTy->isScalableTy() && !Subtarget->hasSVEorSME())
15563 return false;
15564
15565 // Ensure that the predicate for this number of elements is available.
15566 if (Subtarget->hasSVE() && !getSVEPredPatternFromNumElements(MinElts))
15567 return false;
15568
15569 // Ensure the number of vector elements is greater than 1.
15570 if (MinElts < 2)
15571 return false;
15572
15573 // Ensure the element type is legal.
15574 if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
15575 return false;
15576
15577 if (EC.isScalable()) {
15578 UseScalable = true;
15579 return isPowerOf2_32(MinElts) && (MinElts * ElSize) % 128 == 0;
15580 }
15581
15582 unsigned VecSize = DL.getTypeSizeInBits(VecTy);
15583 if (!Subtarget->isNeonAvailable() ||
15584 (Subtarget->useSVEForFixedLengthVectors() &&
15585 (VecSize % Subtarget->getMinSVEVectorSizeInBits() == 0 ||
15586 (VecSize < Subtarget->getMinSVEVectorSizeInBits() &&
15587 isPowerOf2_32(MinElts) && VecSize > 128)))) {
15588 UseScalable = true;
15589 return true;
15590 }
15591
15592 // Ensure the total vector size is 64 or a multiple of 128. Types larger than
15593 // 128 will be split into multiple interleaved accesses.
15594 return VecSize == 64 || VecSize % 128 == 0;
15595}
15596
15598 if (VTy->getElementType() == Type::getDoubleTy(VTy->getContext()))
15599 return ScalableVectorType::get(VTy->getElementType(), 2);
15600
15601 if (VTy->getElementType() == Type::getFloatTy(VTy->getContext()))
15602 return ScalableVectorType::get(VTy->getElementType(), 4);
15603
15604 if (VTy->getElementType() == Type::getBFloatTy(VTy->getContext()))
15605 return ScalableVectorType::get(VTy->getElementType(), 8);
15606
15607 if (VTy->getElementType() == Type::getHalfTy(VTy->getContext()))
15608 return ScalableVectorType::get(VTy->getElementType(), 8);
15609
15610 if (VTy->getElementType() == Type::getInt64Ty(VTy->getContext()))
15611 return ScalableVectorType::get(VTy->getElementType(), 2);
15612
15613 if (VTy->getElementType() == Type::getInt32Ty(VTy->getContext()))
15614 return ScalableVectorType::get(VTy->getElementType(), 4);
15615
15616 if (VTy->getElementType() == Type::getInt16Ty(VTy->getContext()))
15617 return ScalableVectorType::get(VTy->getElementType(), 8);
15618
15619 if (VTy->getElementType() == Type::getInt8Ty(VTy->getContext()))
15620 return ScalableVectorType::get(VTy->getElementType(), 16);
15621
15622 llvm_unreachable("Cannot handle input vector type");
15623}
15624
15625static Function *getStructuredLoadFunction(Module *M, unsigned Factor,
15626 bool Scalable, Type *LDVTy,
15627 Type *PtrTy) {
15628 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
15629 static const Intrinsic::ID SVELoads[3] = {Intrinsic::aarch64_sve_ld2_sret,
15630 Intrinsic::aarch64_sve_ld3_sret,
15631 Intrinsic::aarch64_sve_ld4_sret};
15632 static const Intrinsic::ID NEONLoads[3] = {Intrinsic::aarch64_neon_ld2,
15633 Intrinsic::aarch64_neon_ld3,
15634 Intrinsic::aarch64_neon_ld4};
15635 if (Scalable)
15636 return Intrinsic::getDeclaration(M, SVELoads[Factor - 2], {LDVTy});
15637
15638 return Intrinsic::getDeclaration(M, NEONLoads[Factor - 2], {LDVTy, PtrTy});
15639}
15640
15641static Function *getStructuredStoreFunction(Module *M, unsigned Factor,
15642 bool Scalable, Type *STVTy,
15643 Type *PtrTy) {
15644 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
15645 static const Intrinsic::ID SVEStores[3] = {Intrinsic::aarch64_sve_st2,
15646 Intrinsic::aarch64_sve_st3,
15647 Intrinsic::aarch64_sve_st4};
15648 static const Intrinsic::ID NEONStores[3] = {Intrinsic::aarch64_neon_st2,
15649 Intrinsic::aarch64_neon_st3,
15650 Intrinsic::aarch64_neon_st4};
15651 if (Scalable)
15652 return Intrinsic::getDeclaration(M, SVEStores[Factor - 2], {STVTy});
15653
15654 return Intrinsic::getDeclaration(M, NEONStores[Factor - 2], {STVTy, PtrTy});
15655}
15656
15657/// Lower an interleaved load into a ldN intrinsic.
15658///
15659/// E.g. Lower an interleaved load (Factor = 2):
15660/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
15661/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
15662/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
15663///
15664/// Into:
15665/// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
15666/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
15667/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
15670 ArrayRef<unsigned> Indices, unsigned Factor) const {
15671 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
15672 "Invalid interleave factor");
15673 assert(!Shuffles.empty() && "Empty shufflevector input");
15674 assert(Shuffles.size() == Indices.size() &&
15675 "Unmatched number of shufflevectors and indices");
15676
15677 const DataLayout &DL = LI->getModule()->getDataLayout();
15678
15679 VectorType *VTy = Shuffles[0]->getType();
15680
15681 // Skip if we do not have NEON and skip illegal vector types. We can
15682 // "legalize" wide vector types into multiple interleaved accesses as long as
15683 // the vector types are divisible by 128.
15684 bool UseScalable;
15685 if (!Subtarget->hasNEON() ||
15686 !isLegalInterleavedAccessType(VTy, DL, UseScalable))
15687 return false;
15688
15689 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
15690
15691 auto *FVTy = cast<FixedVectorType>(VTy);
15692
15693 // A pointer vector can not be the return type of the ldN intrinsics. Need to
15694 // load integer vectors first and then convert to pointer vectors.
15695 Type *EltTy = FVTy->getElementType();
15696 if (EltTy->isPointerTy())
15697 FVTy =
15698 FixedVectorType::get(DL.getIntPtrType(EltTy), FVTy->getNumElements());
15699
15700 // If we're going to generate more than one load, reset the sub-vector type
15701 // to something legal.
15702 FVTy = FixedVectorType::get(FVTy->getElementType(),
15703 FVTy->getNumElements() / NumLoads);
15704
15705 auto *LDVTy =
15706 UseScalable ? cast<VectorType>(getSVEContainerIRType(FVTy)) : FVTy;
15707
15708 IRBuilder<> Builder(LI);
15709
15710 // The base address of the load.
15711 Value *BaseAddr = LI->getPointerOperand();
15712
15713 Type *PtrTy = LI->getPointerOperandType();
15714 Type *PredTy = VectorType::get(Type::getInt1Ty(LDVTy->getContext()),
15715 LDVTy->getElementCount());
15716
15717 Function *LdNFunc = getStructuredLoadFunction(LI->getModule(), Factor,
15718 UseScalable, LDVTy, PtrTy);
15719
15720 // Holds sub-vectors extracted from the load intrinsic return values. The
15721 // sub-vectors are associated with the shufflevector instructions they will
15722 // replace.
15724
15725 Value *PTrue = nullptr;
15726 if (UseScalable) {
15727 std::optional<unsigned> PgPattern =
15728 getSVEPredPatternFromNumElements(FVTy->getNumElements());
15729 if (Subtarget->getMinSVEVectorSizeInBits() ==
15730 Subtarget->getMaxSVEVectorSizeInBits() &&
15731 Subtarget->getMinSVEVectorSizeInBits() == DL.getTypeSizeInBits(FVTy))
15732 PgPattern = AArch64SVEPredPattern::all;
15733
15734 auto *PTruePat =
15735 ConstantInt::get(Type::getInt32Ty(LDVTy->getContext()), *PgPattern);
15736 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
15737 {PTruePat});
15738 }
15739
15740 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
15741
15742 // If we're generating more than one load, compute the base address of
15743 // subsequent loads as an offset from the previous.
15744 if (LoadCount > 0)
15745 BaseAddr = Builder.CreateConstGEP1_32(LDVTy->getElementType(), BaseAddr,
15746 FVTy->getNumElements() * Factor);
15747
15748 CallInst *LdN;
15749 if (UseScalable)
15750 LdN = Builder.CreateCall(LdNFunc, {PTrue, BaseAddr}, "ldN");
15751 else
15752 LdN = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
15753
15754 // Extract and store the sub-vectors returned by the load intrinsic.
15755 for (unsigned i = 0; i < Shuffles.size(); i++) {
15756 ShuffleVectorInst *SVI = Shuffles[i];
15757 unsigned Index = Indices[i];
15758
15759 Value *SubVec = Builder.CreateExtractValue(LdN, Index);
15760
15761 if (UseScalable)
15762 SubVec = Builder.CreateExtractVector(
15763 FVTy, SubVec,
15765
15766 // Convert the integer vector to pointer vector if the element is pointer.
15767 if (EltTy->isPointerTy())
15768 SubVec = Builder.CreateIntToPtr(
15770 FVTy->getNumElements()));
15771
15772 SubVecs[SVI].push_back(SubVec);
15773 }
15774 }
15775
15776 // Replace uses of the shufflevector instructions with the sub-vectors
15777 // returned by the load intrinsic. If a shufflevector instruction is
15778 // associated with more than one sub-vector, those sub-vectors will be
15779 // concatenated into a single wide vector.
15780 for (ShuffleVectorInst *SVI : Shuffles) {
15781 auto &SubVec = SubVecs[SVI];
15782 auto *WideVec =
15783 SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
15784 SVI->replaceAllUsesWith(WideVec);
15785 }
15786
15787 return true;
15788}
15789
15790template <typename Iter>
15791bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) {
15792 int MaxLookupDist = 20;
15793 unsigned IdxWidth = DL.getIndexSizeInBits(0);
15794 APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
15795 const Value *PtrA1 =
15796 Ptr->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
15797
15798 while (++It != End) {
15799 if (It->isDebugOrPseudoInst())
15800 continue;
15801 if (MaxLookupDist-- == 0)
15802 break;
15803 if (const auto *SI = dyn_cast<StoreInst>(&*It)) {
15804 const Value *PtrB1 =
15805 SI->getPointerOperand()->stripAndAccumulateInBoundsConstantOffsets(
15806 DL, OffsetB);
15807 if (PtrA1 == PtrB1 &&
15808 (OffsetA.sextOrTrunc(IdxWidth) - OffsetB.sextOrTrunc(IdxWidth))
15809 .abs() == 16)
15810 return true;
15811 }
15812 }
15813
15814 return false;
15815}
15816
15817/// Lower an interleaved store into a stN intrinsic.
15818///
15819/// E.g. Lower an interleaved store (Factor = 3):
15820/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
15821/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
15822/// store <12 x i32> %i.vec, <12 x i32>* %ptr
15823///
15824/// Into:
15825/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
15826/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
15827/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
15828/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
15829///
15830/// Note that the new shufflevectors will be removed and we'll only generate one
15831/// st3 instruction in CodeGen.
15832///
15833/// Example for a more general valid mask (Factor 3). Lower:
15834/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
15835/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
15836/// store <12 x i32> %i.vec, <12 x i32>* %ptr
15837///
15838/// Into:
15839/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
15840/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
15841/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
15842/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
15844 ShuffleVectorInst *SVI,
15845 unsigned Factor) const {
15846
15847 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
15848 "Invalid interleave factor");
15849
15850 auto *VecTy = cast<FixedVectorType>(SVI->getType());
15851 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
15852
15853 unsigned LaneLen = VecTy->getNumElements() / Factor;
15854 Type *EltTy = VecTy->getElementType();
15855 auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
15856
15857 const DataLayout &DL = SI->getModule()->getDataLayout();
15858 bool UseScalable;
15859
15860 // Skip if we do not have NEON and skip illegal vector types. We can
15861 // "legalize" wide vector types into multiple interleaved accesses as long as
15862 // the vector types are divisible by 128.
15863 if (!Subtarget->hasNEON() ||
15864 !isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
15865 return false;
15866
15867 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
15868
15869 Value *Op0 = SVI->getOperand(0);
15870 Value *Op1 = SVI->getOperand(1);
15871 IRBuilder<> Builder(SI);
15872
15873 // StN intrinsics don't support pointer vectors as arguments. Convert pointer
15874 // vectors to integer vectors.
15875 if (EltTy->isPointerTy()) {
15876 Type *IntTy = DL.getIntPtrType(EltTy);
15877 unsigned NumOpElts =
15878 cast<FixedVectorType>(Op0->getType())->getNumElements();
15879
15880 // Convert to the corresponding integer vector.
15881 auto *IntVecTy = FixedVectorType::get(IntTy, NumOpElts);
15882 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
15883 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
15884
15885 SubVecTy = FixedVectorType::get(IntTy, LaneLen);
15886 }
15887
15888 // If we're going to generate more than one store, reset the lane length
15889 // and sub-vector type to something legal.
15890 LaneLen /= NumStores;
15891 SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
15892
15893 auto *STVTy = UseScalable ? cast<VectorType>(getSVEContainerIRType(SubVecTy))
15894 : SubVecTy;
15895
15896 // The base address of the store.
15897 Value *BaseAddr = SI->getPointerOperand();
15898
15899 auto Mask = SVI->getShuffleMask();
15900
15901 // Sanity check if all the indices are NOT in range.
15902 // If mask is `poison`, `Mask` may be a vector of -1s.
15903 // If all of them are `poison`, OOB read will happen later.
15904 if (llvm::all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
15905 return false;
15906 }
15907 // A 64bit st2 which does not start at element 0 will involved adding extra
15908 // ext elements making the st2 unprofitable, and if there is a nearby store
15909 // that points to BaseAddr+16 or BaseAddr-16 then it can be better left as a
15910 // zip;ldp pair which has higher throughput.
15911 if (Factor == 2 && SubVecTy->getPrimitiveSizeInBits() == 64 &&
15912 (Mask[0] != 0 ||
15913 hasNearbyPairedStore(SI->getIterator(), SI->getParent()->end(), BaseAddr,
15914 DL) ||
15915 hasNearbyPairedStore(SI->getReverseIterator(), SI->getParent()->rend(),
15916 BaseAddr, DL)))
15917 return false;
15918
15919 Type *PtrTy = SI->getPointerOperandType();
15920 Type *PredTy = VectorType::get(Type::getInt1Ty(STVTy->getContext()),
15921 STVTy->getElementCount());
15922
15923 Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
15924 UseScalable, STVTy, PtrTy);
15925
15926 Value *PTrue = nullptr;
15927 if (UseScalable) {
15928 std::optional<unsigned> PgPattern =
15929 getSVEPredPatternFromNumElements(SubVecTy->getNumElements());
15930 if (Subtarget->getMinSVEVectorSizeInBits() ==
15931 Subtarget->getMaxSVEVectorSizeInBits() &&
15932 Subtarget->getMinSVEVectorSizeInBits() ==
15933 DL.getTypeSizeInBits(SubVecTy))
15934 PgPattern = AArch64SVEPredPattern::all;
15935
15936 auto *PTruePat =
15937 ConstantInt::get(Type::getInt32Ty(STVTy->getContext()), *PgPattern);
15938 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
15939 {PTruePat});
15940 }
15941
15942 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
15943
15945
15946 // Split the shufflevector operands into sub vectors for the new stN call.
15947 for (unsigned i = 0; i < Factor; i++) {
15948 Value *Shuffle;
15949 unsigned IdxI = StoreCount * LaneLen * Factor + i;
15950 if (Mask[IdxI] >= 0) {
15951 Shuffle = Builder.CreateShuffleVector(
15952 Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0));
15953 } else {
15954 unsigned StartMask = 0;
15955 for (unsigned j = 1; j < LaneLen; j++) {
15956 unsigned IdxJ = StoreCount * LaneLen * Factor + j * Factor + i;
15957 if (Mask[IdxJ] >= 0) {
15958 StartMask = Mask[IdxJ] - j;
15959 break;
15960 }
15961 }
15962 // Note: Filling undef gaps with random elements is ok, since
15963 // those elements were being written anyway (with undefs).
15964 // In the case of all undefs we're defaulting to using elems from 0
15965 // Note: StartMask cannot be negative, it's checked in
15966 // isReInterleaveMask
15967 Shuffle = Builder.CreateShuffleVector(
15968 Op0, Op1, createSequentialMask(StartMask, LaneLen, 0));
15969 }
15970
15971 if (UseScalable)
15972 Shuffle = Builder.CreateInsertVector(
15973 STVTy, UndefValue::get(STVTy), Shuffle,
15974 ConstantInt::get(Type::getInt64Ty(STVTy->getContext()), 0));
15975
15976 Ops.push_back(Shuffle);
15977 }
15978
15979 if (UseScalable)
15980 Ops.push_back(PTrue);
15981
15982 // If we generating more than one store, we compute the base address of
15983 // subsequent stores as an offset from the previous.
15984 if (StoreCount > 0)
15985 BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
15986 BaseAddr, LaneLen * Factor);
15987
15988 Ops.push_back(BaseAddr);
15989 Builder.CreateCall(StNFunc, Ops);
15990 }
15991 return true;
15992}
15993
// Body of AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad.
// NOTE(review): the opening signature line (orig. 15994) and two interior
// lines (orig. 16017: the VectorType::get(...) call head; orig. 16035: the
// declaration of Right) are missing from this extract — restore from the
// full file before compiling.
// Replaces a llvm.experimental.vector.deinterleave2 whose operand is loaded
// by LI with one or more structured-load (ldN) intrinsic calls, then RAUWs
// the deinterleave intrinsic with the aggregated result.
15995 IntrinsicInst *DI, LoadInst *LI) const {
15996 // Only deinterleave2 supported at present.
15997 if (DI->getIntrinsicID() != Intrinsic::experimental_vector_deinterleave2)
15998 return false;
15999
16000 // Only a factor of 2 supported at present.
16001 const unsigned Factor = 2;
16002
16003 VectorType *VTy = cast<VectorType>(DI->getType()->getContainedType(0));
16004 const DataLayout &DL = DI->getModule()->getDataLayout();
16005 bool UseScalable;
16006 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
16007 return false;
16008
16009 // TODO: Add support for using SVE instructions with fixed types later, using
16010 // the code from lowerInterleavedLoad to obtain the correct container type.
16011 if (UseScalable && !VTy->isScalableTy())
16012 return false;
16013
// Wide vectors are split into NumLoads legal-sized ldN operations.
16014 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
16015
16016 VectorType *LdTy =
16018 VTy->getElementCount().divideCoefficientBy(NumLoads));
16019
16020 Type *PtrTy = LI->getPointerOperandType();
16021 Function *LdNFunc = getStructuredLoadFunction(DI->getModule(), Factor,
16022 UseScalable, LdTy, PtrTy);
16023
16024 IRBuilder<> Builder(LI);
16025
// SVE ldN takes an explicit all-true predicate; NEON forms do not.
16026 Value *Pred = nullptr;
16027 if (UseScalable)
16028 Pred =
16029 Builder.CreateVectorSplat(LdTy->getElementCount(), Builder.getTrue());
16030
16031 Value *BaseAddr = LI->getPointerOperand();
16032 Value *Result;
16033 if (NumLoads > 1) {
// Accumulate the even/odd lanes of each partial ldN into Left/Right.
16034 Value *Left = PoisonValue::get(VTy);
16036
16037 for (unsigned I = 0; I < NumLoads; ++I) {
// Each partial load is Factor sub-vectors further along in memory.
16038 Value *Offset = Builder.getInt64(I * Factor);
16039
16040 Value *Address = Builder.CreateGEP(LdTy, BaseAddr, {Offset});
16041 Value *LdN = nullptr;
16042 if (UseScalable)
16043 LdN = Builder.CreateCall(LdNFunc, {Pred, Address}, "ldN");
16044 else
16045 LdN = Builder.CreateCall(LdNFunc, Address, "ldN");
16046
16047 Value *Idx =
16048 Builder.getInt64(I * LdTy->getElementCount().getKnownMinValue());
16049 Left = Builder.CreateInsertVector(
16050 VTy, Left, Builder.CreateExtractValue(LdN, 0), Idx);
16051 Right = Builder.CreateInsertVector(
16052 VTy, Right, Builder.CreateExtractValue(LdN, 1), Idx);
16053 }
16054
// Re-pack the two result vectors into the {VTy, VTy} aggregate the
// deinterleave2 intrinsic returns.
16055 Result = PoisonValue::get(DI->getType());
16056 Result = Builder.CreateInsertValue(Result, Left, 0);
16057 Result = Builder.CreateInsertValue(Result, Right, 1);
16058 } else {
16059 if (UseScalable)
16060 Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
16061 else
16062 Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
16063 }
16064
16065 DI->replaceAllUsesWith(Result);
16066 return true;
16067}
16068
// Body of AArch64TargetLowering::lowerInterleaveIntrinsicToStore.
// NOTE(review): the opening signature line (orig. 16069) and one interior
// line (orig. 16092: the VectorType::get(...) call head) are missing from
// this extract — restore from the full file before compiling.
// Replaces a llvm.experimental.vector.interleave2 feeding SI with one or
// more structured-store (stN) intrinsic calls.
16070 IntrinsicInst *II, StoreInst *SI) const {
16071 // Only interleave2 supported at present.
16072 if (II->getIntrinsicID() != Intrinsic::experimental_vector_interleave2)
16073 return false;
16074
16075 // Only a factor of 2 supported at present.
16076 const unsigned Factor = 2;
16077
16078 VectorType *VTy = cast<VectorType>(II->getOperand(0)->getType());
16079 const DataLayout &DL = II->getModule()->getDataLayout();
16080 bool UseScalable;
16081 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
16082 return false;
16083
16084 // TODO: Add support for using SVE instructions with fixed types later, using
16085 // the code from lowerInterleavedStore to obtain the correct container type.
16086 if (UseScalable && !VTy->isScalableTy())
16087 return false;
16088
// Wide vectors are split into NumStores legal-sized stN operations.
16089 unsigned NumStores = getNumInterleavedAccesses(VTy, DL, UseScalable);
16090
16091 VectorType *StTy =
16093 VTy->getElementCount().divideCoefficientBy(NumStores));
16094
16095 Type *PtrTy = SI->getPointerOperandType();
16096 Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
16097 UseScalable, StTy, PtrTy);
16098
16099 IRBuilder<> Builder(SI);
16100
16101 Value *BaseAddr = SI->getPointerOperand();
// SVE stN takes an explicit all-true predicate; NEON forms do not.
16102 Value *Pred = nullptr;
16103
16104 if (UseScalable)
16105 Pred =
16106 Builder.CreateVectorSplat(StTy->getElementCount(), Builder.getTrue());
16107
16108 Value *L = II->getOperand(0);
16109 Value *R = II->getOperand(1);
16110
16111 for (unsigned I = 0; I < NumStores; ++I) {
16112 Value *Address = BaseAddr;
16113 if (NumStores > 1) {
// For split stores, address and operands advance by one StTy-sized
// chunk (Factor sub-vectors of memory) per iteration.
16114 Value *Offset = Builder.getInt64(I * Factor);
16115 Address = Builder.CreateGEP(StTy, BaseAddr, {Offset});
16116
16117 Value *Idx =
16118 Builder.getInt64(I * StTy->getElementCount().getKnownMinValue());
16119 L = Builder.CreateExtractVector(StTy, II->getOperand(0), Idx);
16120 R = Builder.CreateExtractVector(StTy, II->getOperand(1), Idx);
16121 }
16122
16123 if (UseScalable)
16124 Builder.CreateCall(StNFunc, {L, R, Pred, Address});
16125 else
16126 Builder.CreateCall(StNFunc, {L, R, Address});
16127 }
16128
16129 return true;
16130}
16131
// Body of AArch64TargetLowering::getOptimalMemOpType (SDAG path).
// NOTE(review): the opening signature line (orig. 16132) and one interior
// line (orig. 16146: the MachineMemOperand flags / &Fast arguments of the
// allowsMisalignedMemoryAccesses call) are missing from this extract.
// Picks the widest store type usable to expand memcpy/memset given
// alignment and function attributes (NoImplicitFloat disables SIMD/FP).
16133 const MemOp &Op, const AttributeList &FuncAttributes) const {
16134 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
16135 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
16136 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
16137 // Only use AdvSIMD to implement memset of 32-byte and above. It would have
16138 // taken one instruction to materialize the v2i64 zero and one store (with
16139 // restrictive addressing mode). Just do i64 stores.
16140 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
// Accepts a type either when the op is suitably aligned or when the
// target reports fast misaligned accesses for it.
16141 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
16142 if (Op.isAligned(AlignCheck))
16143 return true;
16144 unsigned Fast;
16145 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
16147 Fast;
16148 };
16149
// Prefer the widest acceptable type: v16i8 > f128 > i64 > i32.
16150 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
16151 AlignmentIsAcceptable(MVT::v16i8, Align(16)))
16152 return MVT::v16i8;
16153 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
16154 return MVT::f128;
16155 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
16156 return MVT::i64;
16157 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
16158 return MVT::i32;
16159 return MVT::Other;
16160}
16161
// Body of AArch64TargetLowering::getOptimalMemOpLLT (GlobalISel path).
// NOTE(review): the opening signature line (orig. 16162) and one interior
// line (orig. 16176, mirroring the missing line in getOptimalMemOpType) are
// absent from this extract.
// GlobalISel twin of getOptimalMemOpType above: same policy, expressed as
// LLTs instead of MVTs — keep the two in sync when editing either.
16163 const MemOp &Op, const AttributeList &FuncAttributes) const {
16164 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
16165 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
16166 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
16167 // Only use AdvSIMD to implement memset of 32-byte and above. It would have
16168 // taken one instruction to materialize the v2i64 zero and one store (with
16169 // restrictive addressing mode). Just do i64 stores.
16170 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
16171 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
16172 if (Op.isAligned(AlignCheck))
16173 return true;
16174 unsigned Fast;
16175 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
16177 Fast;
16178 };
16179
16180 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
16181 AlignmentIsAcceptable(MVT::v2i64, Align(16)))
16182 return LLT::fixed_vector(2, 64);
16183 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
16184 return LLT::scalar(128);
16185 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
16186 return LLT::scalar(64);
16187 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
16188 return LLT::scalar(32);
16189 return LLT();
16190}
16191
16192// 12-bit optionally shifted immediates are legal for adds.
// NOTE(review): the signature line (orig. 16193, presumably
// `bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {`)
// is missing from this extract.
16194 if (Immed == std::numeric_limits<int64_t>::min()) {
16195 LLVM_DEBUG(dbgs() << "Illegal add imm " << Immed
16196 << ": avoid UB for INT64_MIN\n");
16197 return false;
16198 }
16199 // Same encoding for add/sub, just flip the sign.
16200 Immed = std::abs(Immed);
// Legal forms: uimm12, or uimm12 shifted left by 12 (low 12 bits zero,
// value fits in 24 bits).
16201 bool IsLegal = ((Immed >> 12) == 0 ||
16202 ((Immed & 0xfff) == 0 && Immed >> 24 == 0));
16203 LLVM_DEBUG(dbgs() << "Is " << Immed
16204 << " legal add imm: " << (IsLegal ? "yes" : "no") << "\n");
16205 return IsLegal;
16206}
16207
16208// Return false to prevent folding
16209// (mul (add x, c1), c2) -> (add (mul x, c2), c2*c1) in DAGCombine,
16210// if the folding leads to worse code.
// NOTE(review): the signature line (orig. 16211) and interior lines
// orig. 16224 (the check comparing legality of C1 vs C1*C2), 16226 (blank or
// declaration of Insn) and 16229 (the AArch64_IMM::expandMOVImm call filling
// Insn) are missing from this extract — C1 and Insn appear unused/undeclared
// below only because of those gaps.
16212 SDValue AddNode, SDValue ConstNode) const {
16213 // Let the DAGCombiner decide for vector types and large types.
16214 const EVT VT = AddNode.getValueType();
16215 if (VT.isVector() || VT.getScalarSizeInBits() > 64)
16216 return true;
16217
16218 // It is worse if c1 is legal add immediate, while c1*c2 is not
16219 // and has to be composed by at least two instructions.
16220 const ConstantSDNode *C1Node = cast<ConstantSDNode>(AddNode.getOperand(1));
16221 const ConstantSDNode *C2Node = cast<ConstantSDNode>(ConstNode);
16222 const int64_t C1 = C1Node->getSExtValue();
16223 const APInt C1C2 = C1Node->getAPIntValue() * C2Node->getAPIntValue();
16225 return true;
16227 // Adapt to the width of a register.
16228 unsigned BitSize = VT.getSizeInBits() <= 32 ? 32 : 64;
// Reject the fold when materializing C1*C2 needs more than one MOV.
16230 if (Insn.size() > 1)
16231 return false;
16232
16233 // Default to true and let the DAGCombiner decide.
16234 return true;
16235}
16236
16237// Integer comparisons are implemented with ADDS/SUBS, so the range of valid
16238// immediates is the same as for an add or a sub.
// NOTE(review): the signature line (orig. 16239, presumably
// `bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {`)
// is missing from this extract.
16240 return isLegalAddImmediate(Immed);
16241}
16242
16243/// isLegalAddressingMode - Return true if the addressing mode represented
16244/// by AM is legal for this target, for a load/store of the specified type.
// NOTE(review): the opening signature line (orig. 16245, carrying the
// `bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,`
// part) is missing from this extract.
16246 const AddrMode &AMode, Type *Ty,
16247 unsigned AS, Instruction *I) const {
16248 // AArch64 has five basic addressing modes:
16249 // reg
16250 // reg + 9-bit signed offset
16251 // reg + SIZE_IN_BYTES * 12-bit unsigned offset
16252 // reg1 + reg2
16253 // reg + SIZE_IN_BYTES * reg
16254
16255 // No global is ever allowed as a base.
16256 if (AMode.BaseGV)
16257 return false;
16258
16259 // No reg+reg+imm addressing.
16260 if (AMode.HasBaseReg && AMode.BaseOffs && AMode.Scale)
16261 return false;
16262
16263 // Canonicalise `1*ScaledReg + imm` into `BaseReg + imm` and
16264 // `2*ScaledReg` into `BaseReg + ScaledReg`
16265 AddrMode AM = AMode;
16266 if (AM.Scale && !AM.HasBaseReg) {
16267 if (AM.Scale == 1) {
16268 AM.HasBaseReg = true;
16269 AM.Scale = 0;
16270 } else if (AM.Scale == 2) {
16271 AM.HasBaseReg = true;
16272 AM.Scale = 1;
16273 } else {
16274 return false;
16275 }
16276 }
16277
16278 // A base register is required in all addressing modes.
16279 if (!AM.HasBaseReg)
16280 return false;
16281
// Scalable types (SVE): only reg, or reg + element-sized scaled reg, with
// no immediate offset, is representable.
16282 if (Ty->isScalableTy()) {
16283 if (isa<ScalableVectorType>(Ty)) {
16284 uint64_t VecElemNumBytes =
16285 DL.getTypeSizeInBits(cast<VectorType>(Ty)->getElementType()) / 8;
16286 return AM.HasBaseReg && !AM.BaseOffs &&
16287 (AM.Scale == 0 || (uint64_t)AM.Scale == VecElemNumBytes);
16288 }
16289
16290 return AM.HasBaseReg && !AM.BaseOffs && !AM.Scale;
16291 }
16292
16293 // check reg + imm case:
16294 // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
16295 uint64_t NumBytes = 0;
16296 if (Ty->isSized()) {
16297 uint64_t NumBits = DL.getTypeSizeInBits(Ty);
16298 NumBytes = NumBits / 8;
// Non-power-of-two sizes cannot use the scaled-uimm12 form; treat as
// unsized (NumBytes = 0) and rely on the unscaled-offset check.
16299 if (!isPowerOf2_64(NumBits))
16300 NumBytes = 0;
16301 }
16302
16303 return Subtarget->getInstrInfo()->isLegalAddressingMode(NumBytes, AM.BaseOffs,
16304 AM.Scale);
16305}
16306
16307// Check whether the 2 offsets belong to the same imm24 range, and their high
16308// 12bits are same, then their high part can be decoded with the offset of add.
16309int64_t
// NOTE(review): the line completing the qualified name (orig. 16310,
// `AArch64TargetLowering::getPreferredLargeGEPBaseOffset(int64_t MinOffset,`)
// is missing from this extract.
16311 int64_t MaxOffset) const {
// Keep only the bits above the imm12 field; if both offsets share that
// high part and it is itself a legal add immediate, rebase on it so the
// residual offsets fit in imm12. Returns 0 when no rebasing helps.
16312 int64_t HighPart = MinOffset & ~0xfffULL;
16313 if (MinOffset >> 12 == MaxOffset >> 12 && isLegalAddImmediate(HighPart)) {
16314 // Rebase the value to an integer multiple of imm12.
16315 return HighPart;
16316 }
16317
16318 return 0;
16319}
16320
// NOTE(review): the signature line (orig. 16321, presumably
// `bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const {`) is
// missing from this extract. Always opt in to GEP offset splitting.
16322 // Consider splitting large offset of struct or array.
16323 return true;
16324}
16325
// SDAG overload of isFMAFasterThanFMulAndFAdd.
// NOTE(review): the signature line (orig. 16326) is missing from this
// extract. Fused multiply-add is profitable for f32/f64 always, and for
// f16 only when the subtarget has full FP16 support.
16327 const MachineFunction &MF, EVT VT) const {
16328 VT = VT.getScalarType();
16329
16330 if (!VT.isSimple())
16331 return false;
16332
16333 switch (VT.getSimpleVT().SimpleTy) {
16334 case MVT::f16:
16335 return Subtarget->hasFullFP16();
16336 case MVT::f32:
16337 case MVT::f64:
16338 return true;
16339 default:
16340 break;
16341 }
16342
16343 return false;
16344}
16345
// IR-type overload of isFMAFasterThanFMulAndFAdd.
// NOTE(review): the signature line (orig. 16346) is missing from this
// extract. Unlike the EVT overload above, this one does not report f16 as
// profitable even with full FP16 — only float and double qualify.
16347 Type *Ty) const {
16348 switch (Ty->getScalarType()->getTypeID()) {
16349 case Type::FloatTyID:
16350 case Type::DoubleTyID:
16351 return true;
16352 default:
16353 return false;
16354 }
16355}
16356
// NOTE(review): the signature line (orig. 16357) and the continuation of the
// return expression (orig. 16360, the final conjunct after `&&`) are missing
// from this extract. Visible logic: only generate FMAs in the machine
// combiner at -O3 (Aggressive) and never for scalable vectors.
16358 EVT VT, CodeGenOptLevel OptLevel) const {
16359 return (OptLevel >= CodeGenOptLevel::Aggressive) && !VT.isScalableVector() &&
16361}
16362
16363const MCPhysReg *
// NOTE(review): the line completing the qualified name (orig. 16364,
// presumably `AArch64TargetLowering::getScratchRegisters(CallingConv::ID CC)
// const {`) is missing from this extract. Returns a null-terminated list of
// registers stackmaps/patchpoints may clobber.
16365 // LR is a callee-save register, but we must treat it as clobbered by any call
16366 // site. Hence we include LR in the scratch registers, which are in turn added
16367 // as implicit-defs for stackmaps and patchpoints.
16368 static const MCPhysReg ScratchRegs[] = {
16369 AArch64::X16, AArch64::X17, AArch64::LR, 0
16370 };
16371 return ScratchRegs;
16372}
16373
// NOTE(review): the signature line (orig. 16374, presumably
// `ArrayRef<MCPhysReg> AArch64TargetLowering::getRoundingControlRegisters()
// const {`) is missing from this extract. FPCR holds the FP rounding mode
// and control bits on AArch64.
16375 static const MCPhysReg RCRegs[] = {AArch64::FPCR};
16376 return RCRegs;
16377}
16378
16379bool
// NOTE(review): the line completing the qualified name (orig. 16380,
// `AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N,`)
// is missing from this extract.
16381 CombineLevel Level) const {
16382 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
16383 N->getOpcode() == ISD::SRL) &&
16384 "Expected shift op");
16385
16386 SDValue ShiftLHS = N->getOperand(0);
16387 EVT VT = N->getValueType(0);
16388
16389 // If ShiftLHS is unsigned bit extraction: ((x >> C) & mask), then do not
16390 // combine it with shift 'N' to let it be lowered to UBFX except:
16391 // ((x >> C) & mask) << C.
16392 if (ShiftLHS.getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
16393 isa<ConstantSDNode>(ShiftLHS.getOperand(1))) {
16394 uint64_t TruncMask = ShiftLHS.getConstantOperandVal(1);
16395 if (isMask_64(TruncMask)) {
16396 SDValue AndLHS = ShiftLHS.getOperand(0);
16397 if (AndLHS.getOpcode() == ISD::SRL) {
16398 if (auto *SRLC = dyn_cast<ConstantSDNode>(AndLHS.getOperand(1))) {
// Allow commuting only for the ((x >> C) & mask) << C special case,
// where the shift amounts cancel; everything else would destroy the
// UBFX pattern.
16399 if (N->getOpcode() == ISD::SHL)
16400 if (auto *SHLC = dyn_cast<ConstantSDNode>(N->getOperand(1)))
16401 return SRLC->getZExtValue() == SHLC->getZExtValue();
16402 return false;
16403 }
16404 }
16405 }
16406 }
16407 return true;
16408}
16409
// NOTE(review): the signature line (orig. 16410, presumably
// `bool AArch64TargetLowering::isDesirableToCommuteXorWithShift(`) is
// missing from this extract.
// Decide whether xor(shift(x, C), mask) should be commuted to
// shift(xor(x, mask'), C): profitable only when the NOT mask is exactly the
// set of bits the shift leaves defined (a shifted mask matching the shift
// amount), so the commuted form needs no extra masking.
16411 const SDNode *N) const {
16412 assert(N->getOpcode() == ISD::XOR &&
16413 (N->getOperand(0).getOpcode() == ISD::SHL ||
16414 N->getOperand(0).getOpcode() == ISD::SRL) &&
16415 "Expected XOR(SHIFT) pattern");
16416
16417 // Only commute if the entire NOT mask is a hidden shifted mask.
16418 auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1));
16419 auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
16420 if (XorC && ShiftC) {
16421 unsigned MaskIdx, MaskLen;
16422 if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
16423 unsigned ShiftAmt = ShiftC->getZExtValue();
16424 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
// SHL defines the high BitWidth-ShiftAmt bits starting at ShiftAmt;
// SRL defines the low BitWidth-ShiftAmt bits starting at 0.
16425 if (N->getOperand(0).getOpcode() == ISD::SHL)
16426 return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
16427 return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
16428 }
16429 }
16430
16431 return false;
16432}
16433
// NOTE(review): the signature line (orig. 16434, presumably
// `bool AArch64TargetLowering::shouldFoldConstantShiftPairToMask(`) is
// missing from this extract.
// Gate folding shl/srl pairs into an AND mask.
16435 const SDNode *N, CombineLevel Level) const {
16436 assert(((N->getOpcode() == ISD::SHL &&
16437 N->getOperand(0).getOpcode() == ISD::SRL) ||
16438 (N->getOpcode() == ISD::SRL &&
16439 N->getOperand(0).getOpcode() == ISD::SHL)) &&
16440 "Expected shift-shift mask");
16441 // Don't allow multiuse shift folding with the same shift amount.
16442 if (!N->getOperand(0)->hasOneUse())
16443 return false;
16444
16445 // Only fold srl(shl(x,c1),c2) iff C1 >= C2 to prevent loss of UBFX patterns.
16446 EVT VT = N->getValueType(0);
16447 if (N->getOpcode() == ISD::SRL && (VT == MVT::i32 || VT == MVT::i64)) {
16448 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
16449 auto *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
// Non-constant amounts fall through to "fold" (returns true).
16450 return (!C1 || !C2 || C1->getZExtValue() >= C2->getZExtValue());
16451 }
16452
16453 return true;
16454}
16455
// NOTE(review): the signature line (orig. 16456, presumably
// `bool AArch64TargetLowering::shouldFoldSelectWithIdentityConstant(`) is
// missing from this extract. Allow the fold only for legal scalable vector
// types (SVE predicated ops make it free there).
16457 unsigned BinOpcode, EVT VT) const {
16458 return VT.isScalableVector() && isTypeLegal(VT);
16459}
16460
// NOTE(review): the signature line (orig. 16461, presumably
// `bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt
// &Imm,`) is missing from this extract.
// Prefer materializing an integer constant over loading it from a constant
// pool when it takes at most a MOVZ plus two MOVKs.
16462 Type *Ty) const {
16463 assert(Ty->isIntegerTy());
16464
16465 unsigned BitSize = Ty->getPrimitiveSizeInBits();
16466 if (BitSize == 0)
16467 return false;
16468
16469 int64_t Val = Imm.getSExtValue();
// Zero and logical immediates are a single instruction.
16470 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
16471 return true;
16472
// Negative values materialize via MOVN of the complement.
16473 if ((int64_t)Val < 0)
16474 Val = ~Val;
16475 if (BitSize == 32)
16476 Val &= (1LL << 32) - 1;
16477
// Shift counts the number of non-zero 16-bit chunks above the first, i.e.
// the number of MOVKs needed after the initial MOVZ/MOVN.
16478 unsigned Shift = llvm::Log2_64((uint64_t)Val) / 16;
16479 // MOVZ is free so return true for one or fewer MOVK.
16480 return Shift < 3;
16481}
16482
// NOTE(review): the signature line (orig. 16483, presumably
// `bool AArch64TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,`)
// and the guard condition (orig. 16485) preceding the early `return false`
// are missing from this extract. Extracts are cheap only from the low or
// exactly-upper half of the source vector.
16484 unsigned Index) const {
16486 return false;
16487
16488 return (Index == 0 || Index == ResVT.getVectorMinNumElements());
16489}
16490
16491/// Turn vector tests of the signbit in the form of:
16492///   xor (sra X, elt_size(X)-1), -1
16493/// into:
16494///   cmge X, X, #0
// NOTE(review): the signature line (orig. 16495, presumably
// `static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,`)
// and the all-ones check on Ones (orig. 16506) are missing from this
// extract.
16496 const AArch64Subtarget *Subtarget) {
16497 EVT VT = N->getValueType(0);
16498 if (!Subtarget->hasNEON() || !VT.isVector())
16499 return SDValue();
16500
16501 // There must be a shift right algebraic before the xor, and the xor must be a
16502 // 'not' operation.
16503 SDValue Shift = N->getOperand(0);
16504 SDValue Ones = N->getOperand(1);
16505 if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() ||
16507 return SDValue();
16508
16509 // The shift should be smearing the sign bit across each vector element.
16510 auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
16511 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
16512 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
16513 return SDValue();
16514
// not(sra(X, bits-1)) is 1 where X >= 0, which is exactly CMGEz.
16515 return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0));
16516}
16517
16518// Given a vecreduce_add node, detect the below pattern and convert it to the
16519// node sequence with UABDL, [S|U]ADB and UADDLP.
16520//
16521// i32 vecreduce_add(
16522//  v16i32 abs(
16523//    v16i32 sub(
16524//     v16i32 [sign|zero]_extend(v16i8 a), v16i32 [sign|zero]_extend(v16i8 b))))
16525// =================>
16526// i32 vecreduce_add(
16527//  v4i32 UADDLP(
16528//    v8i16 add(
16529//      v8i16 zext(
16530//        v8i8 [S|U]ABD low8:v16i8 a, low8:v16i8 b
16531//      v8i16 zext(
16532//        v8i8 [S|U]ABD high8:v16i8 a, high8:v16i8 b
// NOTE(review): the signature line (orig. 16533, presumably
// `static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N,`) is
// missing from this extract.
16534 SelectionDAG &DAG) {
16535 // Assumed i32 vecreduce_add
16536 if (N->getValueType(0) != MVT::i32)
16537 return SDValue();
16538
16539 SDValue VecReduceOp0 = N->getOperand(0);
16540 unsigned Opcode = VecReduceOp0.getOpcode();
16541 // Assumed v16i32 abs
16542 if (Opcode != ISD::ABS || VecReduceOp0->getValueType(0) != MVT::v16i32)
16543 return SDValue();
16544
16545 SDValue ABS = VecReduceOp0;
16546 // Assumed v16i32 sub
16547 if (ABS->getOperand(0)->getOpcode() != ISD::SUB ||
16548 ABS->getOperand(0)->getValueType(0) != MVT::v16i32)
16549 return SDValue();
16550
16551 SDValue SUB = ABS->getOperand(0);
16552 unsigned Opcode0 = SUB->getOperand(0).getOpcode();
16553 unsigned Opcode1 = SUB->getOperand(1).getOpcode();
16554 // Assumed v16i32 type
16555 if (SUB->getOperand(0)->getValueType(0) != MVT::v16i32 ||
16556 SUB->getOperand(1)->getValueType(0) != MVT::v16i32)
16557 return SDValue();
16558
16559 // Assumed zext or sext
// Both extends must agree; the choice selects ABDU vs ABDS below.
16560 bool IsZExt = false;
16561 if (Opcode0 == ISD::ZERO_EXTEND && Opcode1 == ISD::ZERO_EXTEND) {
16562 IsZExt = true;
16563 } else if (Opcode0 == ISD::SIGN_EXTEND && Opcode1 == ISD::SIGN_EXTEND) {
16564 IsZExt = false;
16565 } else
16566 return SDValue();
16567
16568 SDValue EXT0 = SUB->getOperand(0);
16569 SDValue EXT1 = SUB->getOperand(1);
16570 // Assumed zext's operand has v16i8 type
16571 if (EXT0->getOperand(0)->getValueType(0) != MVT::v16i8 ||
16572 EXT1->getOperand(0)->getValueType(0) != MVT::v16i8)
16573 return SDValue();
16574
16575 // Pattern is detected. Let's convert it to sequence of nodes.
16576 SDLoc DL(N);
16577
16578 // First, create the node pattern of UABD/SABD.
16579 SDValue UABDHigh8Op0 =
16580 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
16581 DAG.getConstant(8, DL, MVT::i64));
16582 SDValue UABDHigh8Op1 =
16583 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
16584 DAG.getConstant(8, DL, MVT::i64));
16585 SDValue UABDHigh8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
16586 UABDHigh8Op0, UABDHigh8Op1);
// Absolute differences are non-negative, so widening is always zext, even
// in the signed (ABDS) case.
16587 SDValue UABDL = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDHigh8);
16588
16589 // Second, create the node pattern of UABAL.
16590 SDValue UABDLo8Op0 =
16591 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
16592 DAG.getConstant(0, DL, MVT::i64));
16593 SDValue UABDLo8Op1 =
16594 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
16595 DAG.getConstant(0, DL, MVT::i64));
16596 SDValue UABDLo8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
16597 UABDLo8Op0, UABDLo8Op1);
16598 SDValue ZExtUABD = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDLo8);
16599 SDValue UABAL = DAG.getNode(ISD::ADD, DL, MVT::v8i16, UABDL, ZExtUABD);
16600
16601 // Third, create the node of UADDLP.
16602 SDValue UADDLP = DAG.getNode(AArch64ISD::UADDLP, DL, MVT::v4i32, UABAL);
16603
16604 // Fourth, create the node of VECREDUCE_ADD.
16605 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, UADDLP);
16606}
16607
16608// Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and vecreduce
16609// vecreduce.add(ext(A)) to vecreduce.add(DOT(zero, A, one))
16610// vecreduce.add(mul(ext(A), ext(B))) to vecreduce.add(DOT(zero, A, B))
16611// If we have vectors larger than v16i8 we extract v16i8 vectors,
16612// Follow the same steps above to get DOT instructions concatenate them
16613// and generate vecreduce.add(concat_vector(DOT, DOT2, ..)).
// NOTE(review): the signature line (orig. 16614, presumably
// `static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG,`),
// the fallback call on the !hasDotProd path (orig. 16617), and the
// DotOpcode initializer (orig. 16663, selecting UDOT vs SDOT from
// ExtOpcode) are missing from this extract.
16615 const AArch64Subtarget *ST) {
16616 if (!ST->hasDotProd())
16618
16619 SDValue Op0 = N->getOperand(0);
16620 if (N->getValueType(0) != MVT::i32 || Op0.getValueType().isScalableVT() ||
16621 Op0.getValueType().getVectorElementType() != MVT::i32)
16622 return SDValue();
16623
16624 unsigned ExtOpcode = Op0.getOpcode();
16625 SDValue A = Op0;
16626 SDValue B;
// The MLA form: both multiplicands must be the same kind of extend of
// same-typed sources.
16627 if (ExtOpcode == ISD::MUL) {
16628 A = Op0.getOperand(0);
16629 B = Op0.getOperand(1);
16630 if (A.getOpcode() != B.getOpcode() ||
16631 A.getOperand(0).getValueType() != B.getOperand(0).getValueType())
16632 return SDValue();
16633 ExtOpcode = A.getOpcode();
16634 }
16635 if (ExtOpcode != ISD::ZERO_EXTEND && ExtOpcode != ISD::SIGN_EXTEND)
16636 return SDValue();
16637
// Dot product requires i8 elements in multiples of 8 lanes.
16638 EVT Op0VT = A.getOperand(0).getValueType();
16639 bool IsValidElementCount = Op0VT.getVectorNumElements() % 8 == 0;
16640 bool IsValidSize = Op0VT.getScalarSizeInBits() == 8;
16641 if (!IsValidElementCount || !IsValidSize)
16642 return SDValue();
16643
16644 SDLoc DL(Op0);
16645 // For non-mla reductions B can be set to 1. For MLA we take the operand of
16646 // the extend B.
16647 if (!B)
16648 B = DAG.getConstant(1, DL, Op0VT);
16649 else
16650 B = B.getOperand(0);
16651
16652 unsigned IsMultipleOf16 = Op0VT.getVectorNumElements() % 16 == 0;
16653 unsigned NumOfVecReduce;
16654 EVT TargetType;
16655 if (IsMultipleOf16) {
16656 NumOfVecReduce = Op0VT.getVectorNumElements() / 16;
16657 TargetType = MVT::v4i32;
16658 } else {
16659 NumOfVecReduce = Op0VT.getVectorNumElements() / 8;
16660 TargetType = MVT::v2i32;
16661 }
16662 auto DotOpcode =
16664 // Handle the case where we need to generate only one Dot operation.
16665 if (NumOfVecReduce == 1) {
16666 SDValue Zeros = DAG.getConstant(0, DL, TargetType);
16667 SDValue Dot = DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros,
16668 A.getOperand(0), B);
16669 return DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
16670 }
16671 // Generate Dot instructions that are multiple of 16.
16672 unsigned VecReduce16Num = Op0VT.getVectorNumElements() / 16;
16673 SmallVector<SDValue, 4> SDotVec16;
16674 unsigned I = 0;
16675 for (; I < VecReduce16Num; I += 1) {
16676 SDValue Zeros = DAG.getConstant(0, DL, MVT::v4i32);
16677 SDValue Op0 =
16678 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, A.getOperand(0),
16679 DAG.getConstant(I * 16, DL, MVT::i64));
16680 SDValue Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, B,
16681 DAG.getConstant(I * 16, DL, MVT::i64));
16682 SDValue Dot =
16683 DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Op0, Op1);
16684 SDotVec16.push_back(Dot);
16685 }
16686 // Concatenate dot operations.
16687 EVT SDot16EVT =
16688 EVT::getVectorVT(*DAG.getContext(), MVT::i32, 4 * VecReduce16Num);
16689 SDValue ConcatSDot16 =
16690 DAG.getNode(ISD::CONCAT_VECTORS, DL, SDot16EVT, SDotVec16);
16691 SDValue VecReduceAdd16 =
16692 DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), ConcatSDot16);
16693 unsigned VecReduce8Num = (Op0VT.getVectorNumElements() % 16) / 8;
16694 if (VecReduce8Num == 0)
16695 return VecReduceAdd16;
16696
16697 // Generate the remainder Dot operation that is multiple of 8.
// After the loop above, I == VecReduce16Num, so I * 16 is the offset of
// the trailing 8-lane chunk.
16698 SmallVector<SDValue, 4> SDotVec8;
16699 SDValue Zeros = DAG.getConstant(0, DL, MVT::v2i32);
16700 SDValue Vec8Op0 =
16701 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, A.getOperand(0),
16702 DAG.getConstant(I * 16, DL, MVT::i64));
16703 SDValue Vec8Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, B,
16704 DAG.getConstant(I * 16, DL, MVT::i64));
16705 SDValue Dot =
16706 DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Vec8Op0, Vec8Op1);
16707 SDValue VecReudceAdd8 =
16708 DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
16709 return DAG.getNode(ISD::ADD, DL, N->getValueType(0), VecReduceAdd16,
16710 VecReudceAdd8);
16711}
16712
16713// Given an (integer) vecreduce, we know the order of the inputs does not
16714// matter. We can convert UADDV(add(zext(extract_lo(x)), zext(extract_hi(x))))
16715// into UADDV(UADDLP(x)). This can also happen through an extra add, where we
16716// transform UADDV(add(y, add(zext(extract_lo(x)), zext(extract_hi(x))))).
// NOTE(review): the signature line (orig. 16717, presumably
// `static SDValue performUADDVAddCombine(SDValue A, SelectionDAG &DAG) {`)
// and several interior condition/expression lines (orig. 16732, 16737,
// 16741, 16743, 16746) are missing from this extract; the dangling
// conditions below reflect those gaps, not the real source.
16718 auto DetectAddExtract = [&](SDValue A) {
16719 // Look for add(zext(extract_lo(x)), zext(extract_hi(x))), returning
16720 // UADDLP(x) if found.
16721 assert(A.getOpcode() == ISD::ADD);
16722 EVT VT = A.getValueType();
16723 SDValue Op0 = A.getOperand(0);
16724 SDValue Op1 = A.getOperand(1);
// NOTE(review): `Op0.getOpcode() != Op0.getOpcode()` is always false —
// presumably the real check compares Op0 against Op1; confirm against the
// full file (may also be corrected upstream).
16725 if (Op0.getOpcode() != Op0.getOpcode() ||
16726 (Op0.getOpcode() != ISD::ZERO_EXTEND &&
16727 Op0.getOpcode() != ISD::SIGN_EXTEND))
16728 return SDValue();
16729 SDValue Ext0 = Op0.getOperand(0);
16730 SDValue Ext1 = Op1.getOperand(0);
16731 if (Ext0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
16733 Ext0.getOperand(0) != Ext1.getOperand(0))
16734 return SDValue();
16735 // Check that the type is twice the add types, and the extract are from
16736 // upper/lower parts of the same source.
16738 VT.getVectorNumElements() * 2)
16739 return SDValue();
16740 if ((Ext0.getConstantOperandVal(1) != 0 ||
16742 (Ext1.getConstantOperandVal(1) != 0 ||
16744 return SDValue();
16745 unsigned Opcode = Op0.getOpcode() == ISD::ZERO_EXTEND ? AArch64ISD::UADDLP
16747 return DAG.getNode(Opcode, SDLoc(A), VT, Ext0.getOperand(0));
16748 };
16749
16750 if (SDValue R = DetectAddExtract(A))
16751 return R;
16752
// Recurse one level through an extra add on either side, re-adding the
// untouched operand on top of the combined result.
16753 if (A.getOperand(0).getOpcode() == ISD::ADD && A.getOperand(0).hasOneUse())
16754 if (SDValue R = performUADDVAddCombine(A.getOperand(0), DAG))
16755 return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
16756 A.getOperand(1));
16757 if (A.getOperand(1).getOpcode() == ISD::ADD && A.getOperand(1).hasOneUse())
16758 if (SDValue R = performUADDVAddCombine(A.getOperand(1), DAG))
16759 return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
16760 A.getOperand(0));
16761 return SDValue();
16762}
16763
16764// We can convert a UADDV(add(zext(64-bit source), zext(64-bit source))) into
16765// UADDLV(concat), where the concat represents the 64-bit zext sources.
// NOTE(review): the signature line (orig. 16766, presumably
// `static SDValue performUADDVZextCombine(SDValue A, SelectionDAG &DAG) {`)
// is missing from this extract.
16767 // Look for add(zext(64-bit source), zext(64-bit source)), returning
16768 // UADDLV(concat(zext, zext)) if found.
16769 assert(A.getOpcode() == ISD::ADD);
16770 EVT VT = A.getValueType();
16771 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
16772 return SDValue();
16773 SDValue Op0 = A.getOperand(0);
16774 SDValue Op1 = A.getOperand(1);
16775 if (Op0.getOpcode() != ISD::ZERO_EXTEND || Op0.getOpcode() != Op1.getOpcode())
16776 return SDValue();
16777 SDValue Ext0 = Op0.getOperand(0);
16778 SDValue Ext1 = Op1.getOperand(0);
16779 EVT ExtVT0 = Ext0.getValueType();
16780 EVT ExtVT1 = Ext1.getValueType();
16781 // Check zext VTs are the same and 64-bit length.
16782 if (ExtVT0 != ExtVT1 ||
16783 VT.getScalarSizeInBits() != (2 * ExtVT0.getScalarSizeInBits()))
16784 return SDValue();
16785 // Get VT for concat of zext sources.
16786 EVT PairVT = ExtVT0.getDoubleNumVectorElementsVT(*DAG.getContext());
16787 SDValue Concat =
16788 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(A), PairVT, Ext0, Ext1);
16789
16790 switch (VT.getSimpleVT().SimpleTy) {
16791 case MVT::v2i64:
16792 case MVT::v4i32:
16793 return DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), VT, Concat);
16794 case MVT::v8i16: {
// UADDLV on a byte vector produces a v4i32 result; re-view it as v8i16
// with a no-op vector cast to match the expected type.
16795 SDValue Uaddlv =
16796 DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), MVT::v4i32, Concat);
16797 return DAG.getNode(AArch64ISD::NVCAST, SDLoc(A), MVT::v8i16, Uaddlv);
16798 }
16799 default:
16800 llvm_unreachable("Unhandled vector type");
16801 }
16802}
16803
// NOTE(review): the signature line (orig. 16804, presumably
// `static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) {`) is
// missing from this extract. Dispatches a UADDV over an ADD to the two
// combines above (UADDLP form first, then UADDLV-of-concat form).
16805 SDValue A = N->getOperand(0);
16806 if (A.getOpcode() == ISD::ADD) {
16807 if (SDValue R = performUADDVAddCombine(A, DAG))
16808 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), R);
16809 else if (SDValue R = performUADDVZextCombine(A, DAG))
16810 return R;
16811 }
16812 return SDValue();
16813}
16814
// NOTE(review): the opening signature lines (orig. 16815-16816, presumably
// `static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
// TargetLowering::DAGCombinerInfo &DCI,`) are missing from this extract.
// Post-legalization only: try folding xor-of-vector-sra into CMGEz.
16817 const AArch64Subtarget *Subtarget) {
16818 if (DCI.isBeforeLegalizeOps())
16819 return SDValue();
16820
16821 return foldVectorXorShiftIntoCmp(N, DAG, Subtarget);
16822}
16823
16824SDValue
// NOTE(review): the line declaring `Attr` (orig. 16828, presumably
// `AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();`)
// is missing from this extract.
// Target hook for expanding signed division by a power of two.
16825AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
16826 SelectionDAG &DAG,
16827 SmallVectorImpl<SDNode *> &Created) const {
// Returning SDValue(N, 0) means "keep the SDIV node as-is".
16829 if (isIntDivCheap(N->getValueType(0), Attr))
16830 return SDValue(N,0); // Lower SDIV as SDIV
16831
16832 EVT VT = N->getValueType(0);
16833
16834 // For scalable and fixed types, mark them as cheap so we can handle it much
16835 // later. This allows us to handle larger than legal types.
16836 if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
16837 return SDValue(N, 0);
16838
16839 // fold (sdiv X, pow2)
16840 if ((VT != MVT::i32 && VT != MVT::i64) ||
16841 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
16842 return SDValue();
16843
// Delegate to the generic CSEL/CMov-based pow2 sdiv expansion.
16844 return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
16845}
16846
16847SDValue
16848AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
16849 SelectionDAG &DAG,
16850 SmallVectorImpl<SDNode *> &Created) const {
16852 if (isIntDivCheap(N->getValueType(0), Attr))
16853 return SDValue(N, 0); // Lower SREM as SREM
16854
16855 EVT VT = N->getValueType(0);
16856
16857 // For scalable and fixed types, mark them as cheap so we can handle it much
16858 // later. This allows us to handle larger than legal types.
16859 if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
16860 return SDValue(N, 0);
16861
16862 // fold (srem X, pow2)
16863 if ((VT != MVT::i32 && VT != MVT::i64) ||
16864 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
16865 return SDValue();
16866
16867 unsigned Lg2 = Divisor.countr_zero();
16868 if (Lg2 == 0)
16869 return SDValue();
16870
16871 SDLoc DL(N);
16872 SDValue N0 = N->getOperand(0);
16873 SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
16874 SDValue Zero = DAG.getConstant(0, DL, VT);
16875 SDValue CCVal, CSNeg;
16876 if (Lg2 == 1) {
16877 SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETGE, CCVal, DAG, DL);
16878 SDValue And = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
16879 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, And, And, CCVal, Cmp);
16880
16881 Created.push_back(Cmp.getNode());
16882 Created.push_back(And.getNode());
16883 } else {
16884 SDValue CCVal = DAG.getConstant(AArch64CC::MI, DL, MVT_CC);
16885 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
16886
16887 SDValue Negs = DAG.getNode(AArch64ISD::SUBS, DL, VTs, Zero, N0);
16888 SDValue AndPos = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
16889 SDValue AndNeg = DAG.getNode(ISD::AND, DL, VT, Negs, Pow2MinusOne);
16890 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, AndPos, AndNeg, CCVal,
16891 Negs.getValue(1));
16892
16893 Created.push_back(Negs.getNode());
16894 Created.push_back(AndPos.getNode());
16895 Created.push_back(AndNeg.getNode());
16896 }
16897
16898 return CSNeg;
16899}
16900
16901static std::optional<unsigned> IsSVECntIntrinsic(SDValue S) {
16902 switch(getIntrinsicID(S.getNode())) {
16903 default:
16904 break;
16905 case Intrinsic::aarch64_sve_cntb:
16906 return 8;
16907 case Intrinsic::aarch64_sve_cnth:
16908 return 16;
16909 case Intrinsic::aarch64_sve_cntw:
16910 return 32;
16911 case Intrinsic::aarch64_sve_cntd:
16912 return 64;
16913 }
16914 return {};
16915}
16916
16917/// Calculates what the pre-extend type is, based on the extension
16918/// operation node provided by \p Extend.
16919///
16920/// In the case that \p Extend is a SIGN_EXTEND or a ZERO_EXTEND, the
16921/// pre-extend type is pulled directly from the operand, while other extend
16922/// operations need a bit more inspection to get this information.
16923///
16924/// \param Extend The SDNode from the DAG that represents the extend operation
16925///
16926/// \returns The type representing the \p Extend source type, or \p MVT::Other
16927/// if no valid type can be determined
16929 switch (Extend.getOpcode()) {
16930 case ISD::SIGN_EXTEND:
16931 case ISD::ZERO_EXTEND:
16932 return Extend.getOperand(0).getValueType();
16933 case ISD::AssertSext:
16934 case ISD::AssertZext:
16936 VTSDNode *TypeNode = dyn_cast<VTSDNode>(Extend.getOperand(1));
16937 if (!TypeNode)
16938 return MVT::Other;
16939 return TypeNode->getVT();
16940 }
16941 case ISD::AND: {
16943 dyn_cast<ConstantSDNode>(Extend.getOperand(1).getNode());
16944 if (!Constant)
16945 return MVT::Other;
16946
16947 uint32_t Mask = Constant->getZExtValue();
16948
16949 if (Mask == UCHAR_MAX)
16950 return MVT::i8;
16951 else if (Mask == USHRT_MAX)
16952 return MVT::i16;
16953 else if (Mask == UINT_MAX)
16954 return MVT::i32;
16955
16956 return MVT::Other;
16957 }
16958 default:
16959 return MVT::Other;
16960 }
16961}
16962
16963/// Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern
16964/// into sext/zext(buildvector) or sext/zext(shuffle) making use of the vector
16965/// SExt/ZExt rather than the scalar SExt/ZExt
16967 EVT VT = BV.getValueType();
16968 if (BV.getOpcode() != ISD::BUILD_VECTOR &&
16970 return SDValue();
16971
16972 // Use the first item in the buildvector/shuffle to get the size of the
16973 // extend, and make sure it looks valid.
16974 SDValue Extend = BV->getOperand(0);
16975 unsigned ExtendOpcode = Extend.getOpcode();
16976 bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND ||
16977 ExtendOpcode == ISD::SIGN_EXTEND_INREG ||
16978 ExtendOpcode == ISD::AssertSext;
16979 if (!IsSExt && ExtendOpcode != ISD::ZERO_EXTEND &&
16980 ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND)
16981 return SDValue();
16982 // Shuffle inputs are vector, limit to SIGN_EXTEND and ZERO_EXTEND to ensure
16983 // calculatePreExtendType will work without issue.
16984 if (BV.getOpcode() == ISD::VECTOR_SHUFFLE &&
16985 ExtendOpcode != ISD::SIGN_EXTEND && ExtendOpcode != ISD::ZERO_EXTEND)
16986 return SDValue();
16987
16988 // Restrict valid pre-extend data type
16989 EVT PreExtendType = calculatePreExtendType(Extend);
16990 if (PreExtendType == MVT::Other ||
16991 PreExtendType.getScalarSizeInBits() != VT.getScalarSizeInBits() / 2)
16992 return SDValue();
16993
16994 // Make sure all other operands are equally extended
16995 for (SDValue Op : drop_begin(BV->ops())) {
16996 if (Op.isUndef())
16997 continue;
16998 unsigned Opc = Op.getOpcode();
16999 bool OpcIsSExt = Opc == ISD::SIGN_EXTEND || Opc == ISD::SIGN_EXTEND_INREG ||
17000 Opc == ISD::AssertSext;
17001 if (OpcIsSExt != IsSExt || calculatePreExtendType(Op) != PreExtendType)
17002 return SDValue();
17003 }
17004
17005 SDValue NBV;
17006 SDLoc DL(BV);
17007 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
17008 EVT PreExtendVT = VT.changeVectorElementType(PreExtendType);
17009 EVT PreExtendLegalType =
17010 PreExtendType.getScalarSizeInBits() < 32 ? MVT::i32 : PreExtendType;
17012 for (SDValue Op : BV->ops())
17013 NewOps.push_back(Op.isUndef() ? DAG.getUNDEF(PreExtendLegalType)
17014 : DAG.getAnyExtOrTrunc(Op.getOperand(0), DL,
17015 PreExtendLegalType));
17016 NBV = DAG.getNode(ISD::BUILD_VECTOR, DL, PreExtendVT, NewOps);
17017 } else { // BV.getOpcode() == ISD::VECTOR_SHUFFLE
17018 EVT PreExtendVT = VT.changeVectorElementType(PreExtendType.getScalarType());
17019 NBV = DAG.getVectorShuffle(PreExtendVT, DL, BV.getOperand(0).getOperand(0),
17020 BV.getOperand(1).isUndef()
17021 ? DAG.getUNDEF(PreExtendVT)
17022 : BV.getOperand(1).getOperand(0),
17023 cast<ShuffleVectorSDNode>(BV)->getMask());
17024 }
17025 return DAG.getNode(IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, VT, NBV);
17026}
17027
17028/// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup))
17029/// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
17031 // If the value type isn't a vector, none of the operands are going to be dups
17032 EVT VT = Mul->getValueType(0);
17033 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
17034 return SDValue();
17035
17036 SDValue Op0 = performBuildShuffleExtendCombine(Mul->getOperand(0), DAG);
17037 SDValue Op1 = performBuildShuffleExtendCombine(Mul->getOperand(1), DAG);
17038
17039 // Neither operands have been changed, don't make any further changes
17040 if (!Op0 && !Op1)
17041 return SDValue();
17042
17043 SDLoc DL(Mul);
17044 return DAG.getNode(Mul->getOpcode(), DL, VT, Op0 ? Op0 : Mul->getOperand(0),
17045 Op1 ? Op1 : Mul->getOperand(1));
17046}
17047
17048// Combine v4i32 Mul(And(Srl(X, 15), 0x10001), 0xffff) -> v8i16 CMLTz
17049// Same for other types with equivalent constants.
17051 EVT VT = N->getValueType(0);
17052 if (VT != MVT::v2i64 && VT != MVT::v1i64 && VT != MVT::v2i32 &&
17053 VT != MVT::v4i32 && VT != MVT::v4i16 && VT != MVT::v8i16)
17054 return SDValue();
17055 if (N->getOperand(0).getOpcode() != ISD::AND ||
17056 N->getOperand(0).getOperand(0).getOpcode() != ISD::SRL)
17057 return SDValue();
17058
17059 SDValue And = N->getOperand(0);
17060 SDValue Srl = And.getOperand(0);
17061
17062 APInt V1, V2, V3;
17063 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), V1) ||
17064 !ISD::isConstantSplatVector(And.getOperand(1).getNode(), V2) ||
17066 return SDValue();
17067
17068 unsigned HalfSize = VT.getScalarSizeInBits() / 2;
17069 if (!V1.isMask(HalfSize) || V2 != (1ULL | 1ULL << HalfSize) ||
17070 V3 != (HalfSize - 1))
17071 return SDValue();
17072
17073 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
17074 EVT::getIntegerVT(*DAG.getContext(), HalfSize),
17075 VT.getVectorElementCount() * 2);
17076
17077 SDLoc DL(N);
17078 SDValue In = DAG.getNode(AArch64ISD::NVCAST, DL, HalfVT, Srl.getOperand(0));
17079 SDValue CM = DAG.getNode(AArch64ISD::CMLTz, DL, HalfVT, In);
17080 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, CM);
17081}
17082
17085 const AArch64Subtarget *Subtarget) {
17086
17087 if (SDValue Ext = performMulVectorExtendCombine(N, DAG))
17088 return Ext;
17090 return Ext;
17091
17092 if (DCI.isBeforeLegalizeOps())
17093 return SDValue();
17094
17095 // Canonicalize X*(Y+1) -> X*Y+X and (X+1)*Y -> X*Y+Y,
17096 // and in MachineCombiner pass, add+mul will be combined into madd.
17097 // Similarly, X*(1-Y) -> X - X*Y and (1-Y)*X -> X - Y*X.
17098 SDLoc DL(N);
17099 EVT VT = N->getValueType(0);
17100 SDValue N0 = N->getOperand(0);
17101 SDValue N1 = N->getOperand(1);
17102 SDValue MulOper;
17103 unsigned AddSubOpc;
17104
17105 auto IsAddSubWith1 = [&](SDValue V) -> bool {
17106 AddSubOpc = V->getOpcode();
17107 if ((AddSubOpc == ISD::ADD || AddSubOpc == ISD::SUB) && V->hasOneUse()) {
17108 SDValue Opnd = V->getOperand(1);
17109 MulOper = V->getOperand(0);
17110 if (AddSubOpc == ISD::SUB)
17111 std::swap(Opnd, MulOper);
17112 if (auto C = dyn_cast<ConstantSDNode>(Opnd))
17113 return C->isOne();
17114 }
17115 return false;
17116 };
17117
17118 if (IsAddSubWith1(N0)) {
17119 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N1, MulOper);
17120 return DAG.getNode(AddSubOpc, DL, VT, N1, MulVal);
17121 }
17122
17123 if (IsAddSubWith1(N1)) {
17124 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N0, MulOper);
17125 return DAG.getNode(AddSubOpc, DL, VT, N0, MulVal);
17126 }
17127
17128 // The below optimizations require a constant RHS.
17129 if (!isa<ConstantSDNode>(N1))
17130 return SDValue();
17131
17132 ConstantSDNode *C = cast<ConstantSDNode>(N1);
17133 const APInt &ConstValue = C->getAPIntValue();
17134
17135 // Allow the scaling to be folded into the `cnt` instruction by preventing
17136 // the scaling to be obscured here. This makes it easier to pattern match.
17137 if (IsSVECntIntrinsic(N0) ||
17138 (N0->getOpcode() == ISD::TRUNCATE &&
17139 (IsSVECntIntrinsic(N0->getOperand(0)))))
17140 if (ConstValue.sge(1) && ConstValue.sle(16))
17141 return SDValue();
17142
17143 // Multiplication of a power of two plus/minus one can be done more
17144 // cheaply as shift+add/sub. For now, this is true unilaterally. If
17145 // future CPUs have a cheaper MADD instruction, this may need to be
17146 // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
17147 // 64-bit is 5 cycles, so this is always a win.
17148 // More aggressively, some multiplications N0 * C can be lowered to
17149 // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
17150 // e.g. 6=3*2=(2+1)*2, 45=(1+4)*(1+8)
17151 // TODO: lower more cases.
17152
17153 // TrailingZeroes is used to test if the mul can be lowered to
17154 // shift+add+shift.
17155 unsigned TrailingZeroes = ConstValue.countr_zero();
17156 if (TrailingZeroes) {
17157 // Conservatively do not lower to shift+add+shift if the mul might be
17158 // folded into smul or umul.
17159 if (N0->hasOneUse() && (isSignExtended(N0, DAG) ||
17160 isZeroExtended(N0, DAG)))
17161 return SDValue();
17162 // Conservatively do not lower to shift+add+shift if the mul might be
17163 // folded into madd or msub.
17164 if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ADD ||
17165 N->use_begin()->getOpcode() == ISD::SUB))
17166 return SDValue();
17167 }
17168 // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
17169 // and shift+add+shift.
17170 APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);
17171 unsigned ShiftAmt;
17172
17173 auto Shl = [&](SDValue N0, unsigned N1) {
17174 SDValue RHS = DAG.getConstant(N1, DL, MVT::i64);
17175 return DAG.getNode(ISD::SHL, DL, VT, N0, RHS);
17176 };
17177 auto Add = [&](SDValue N0, SDValue N1) {
17178 return DAG.getNode(ISD::ADD, DL, VT, N0, N1);
17179 };
17180 auto Sub = [&](SDValue N0, SDValue N1) {
17181 return DAG.getNode(ISD::SUB, DL, VT, N0, N1);
17182 };
17183 auto Negate = [&](SDValue N) {
17184 SDValue Zero = DAG.getConstant(0, DL, VT);
17185 return DAG.getNode(ISD::SUB, DL, VT, Zero, N);
17186 };
17187
17188 // Can the const C be decomposed into (1+2^M1)*(1+2^N1), eg:
17189 // C = 45 is equal to (1+4)*(1+8), we don't decompose it into (1+2)*(16-1) as
17190 // the (2^N - 1) can't be execused via a single instruction.
17191 auto isPowPlusPlusConst = [](APInt C, APInt &M, APInt &N) {
17192 unsigned BitWidth = C.getBitWidth();
17193 for (unsigned i = 1; i < BitWidth / 2; i++) {
17194 APInt Rem;
17195 APInt X(BitWidth, (1 << i) + 1);
17196 APInt::sdivrem(C, X, N, Rem);
17197 APInt NVMinus1 = N - 1;
17198 if (Rem == 0 && NVMinus1.isPowerOf2()) {
17199 M = X;
17200 return true;
17201 }
17202 }
17203 return false;
17204 };
17205
17206 if (ConstValue.isNonNegative()) {
17207 // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
17208 // (mul x, 2^N - 1) => (sub (shl x, N), x)
17209 // (mul x, (2^(N-M) - 1) * 2^M) => (sub (shl x, N), (shl x, M))
17210 // (mul x, (2^M + 1) * (2^N + 1))
17211 // => MV = (add (shl x, M), x); (add (shl MV, N), MV)
17212 APInt SCVMinus1 = ShiftedConstValue - 1;
17213 APInt SCVPlus1 = ShiftedConstValue + 1;
17214 APInt CVPlus1 = ConstValue + 1;
17215 APInt CVM, CVN;
17216 if (SCVMinus1.isPowerOf2()) {
17217 ShiftAmt = SCVMinus1.logBase2();
17218 return Shl(Add(Shl(N0, ShiftAmt), N0), TrailingZeroes);
17219 } else if (CVPlus1.isPowerOf2()) {
17220 ShiftAmt = CVPlus1.logBase2();
17221 return Sub(Shl(N0, ShiftAmt), N0);
17222 } else if (SCVPlus1.isPowerOf2()) {
17223 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
17224 return Sub(Shl(N0, ShiftAmt), Shl(N0, TrailingZeroes));
17225 } else if (Subtarget->hasALULSLFast() &&
17226 isPowPlusPlusConst(ConstValue, CVM, CVN)) {
17227 APInt CVMMinus1 = CVM - 1;
17228 APInt CVNMinus1 = CVN - 1;
17229 unsigned ShiftM1 = CVMMinus1.logBase2();
17230 unsigned ShiftN1 = CVNMinus1.logBase2();
17231 // LSLFast implicate that Shifts <= 3 places are fast
17232 if (ShiftM1 <= 3 && ShiftN1 <= 3) {
17233 SDValue MVal = Add(Shl(N0, ShiftM1), N0);
17234 return Add(Shl(MVal, ShiftN1), MVal);
17235 }
17236 }
17237 } else {
17238 // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
17239 // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
17240 // (mul x, -(2^(N-M) - 1) * 2^M) => (sub (shl x, M), (shl x, N))
17241 APInt SCVPlus1 = -ShiftedConstValue + 1;
17242 APInt CVNegPlus1 = -ConstValue + 1;
17243 APInt CVNegMinus1 = -ConstValue - 1;
17244 if (CVNegPlus1.isPowerOf2()) {
17245 ShiftAmt = CVNegPlus1.logBase2();
17246 return Sub(N0, Shl(N0, ShiftAmt));
17247 } else if (CVNegMinus1.isPowerOf2()) {
17248 ShiftAmt = CVNegMinus1.logBase2();
17249 return Negate(Add(Shl(N0, ShiftAmt), N0));
17250 } else if (SCVPlus1.isPowerOf2()) {
17251 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
17252 return Sub(Shl(N0, TrailingZeroes), Shl(N0, ShiftAmt));
17253 }
17254 }
17255
17256 return SDValue();
17257}
17258
17260 SelectionDAG &DAG) {
17261 // Take advantage of vector comparisons producing 0 or -1 in each lane to
17262 // optimize away operation when it's from a constant.
17263 //
17264 // The general transformation is:
17265 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
17266 // AND(VECTOR_CMP(x,y), constant2)
17267 // constant2 = UNARYOP(constant)
17268
17269 // Early exit if this isn't a vector operation, the operand of the
17270 // unary operation isn't a bitwise AND, or if the sizes of the operations
17271 // aren't the same.
17272 EVT VT = N->getValueType(0);
17273 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
17274 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
17275 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
17276 return SDValue();
17277
17278 // Now check that the other operand of the AND is a constant. We could
17279 // make the transformation for non-constant splats as well, but it's unclear
17280 // that would be a benefit as it would not eliminate any operations, just
17281 // perform one more step in scalar code before moving to the vector unit.
17282 if (BuildVectorSDNode *BV =
17283 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
17284 // Bail out if the vector isn't a constant.
17285 if (!BV->isConstant())
17286 return SDValue();
17287
17288 // Everything checks out. Build up the new and improved node.
17289 SDLoc DL(N);
17290 EVT IntVT = BV->getValueType(0);
17291 // Create a new constant of the appropriate type for the transformed
17292 // DAG.
17293 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
17294 // The AND node needs bitcasts to/from an integer vector type around it.
17295 SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
17296 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
17297 N->getOperand(0)->getOperand(0), MaskConst);
17298 SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
17299 return Res;
17300 }
17301
17302 return SDValue();
17303}
17304
17306 const AArch64Subtarget *Subtarget) {
17307 // First try to optimize away the conversion when it's conditionally from
17308 // a constant. Vectors only.
17310 return Res;
17311
17312 EVT VT = N->getValueType(0);
17313 if (VT != MVT::f32 && VT != MVT::f64)
17314 return SDValue();
17315
17316 // Only optimize when the source and destination types have the same width.
17317 if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
17318 return SDValue();
17319
17320 // If the result of an integer load is only used by an integer-to-float
17321 // conversion, use a fp load instead and a AdvSIMD scalar {S|U}CVTF instead.
17322 // This eliminates an "integer-to-vector-move" UOP and improves throughput.
17323 SDValue N0 = N->getOperand(0);
17324 if (Subtarget->isNeonAvailable() && ISD::isNormalLoad(N0.getNode()) &&
17325 N0.hasOneUse() &&
17326 // Do not change the width of a volatile load.
17327 !cast<LoadSDNode>(N0)->isVolatile()) {
17328 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
17329 SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
17330 LN0->getPointerInfo(), LN0->getAlign(),
17331 LN0->getMemOperand()->getFlags());
17332
17333 // Make sure successors of the original load stay after it by updating them
17334 // to use the new Chain.
17335 DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
17336
17337 unsigned Opcode =
17339 return DAG.getNode(Opcode, SDLoc(N), VT, Load);
17340 }
17341
17342 return SDValue();
17343}
17344
17345/// Fold a floating-point multiply by power of two into floating-point to
17346/// fixed-point conversion.
17349 const AArch64Subtarget *Subtarget) {
17350 if (!Subtarget->isNeonAvailable())
17351 return SDValue();
17352
17353 if (!N->getValueType(0).isSimple())
17354 return SDValue();
17355
17356 SDValue Op = N->getOperand(0);
17357 if (!Op.getValueType().isSimple() || Op.getOpcode() != ISD::FMUL)
17358 return SDValue();
17359
17360 if (!Op.getValueType().is64BitVector() && !Op.getValueType().is128BitVector())
17361 return SDValue();
17362
17363 SDValue ConstVec = Op->getOperand(1);
17364 if (!isa<BuildVectorSDNode>(ConstVec))
17365 return SDValue();
17366
17367 MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
17368 uint32_t FloatBits = FloatTy.getSizeInBits();
17369 if (FloatBits != 32 && FloatBits != 64 &&
17370 (FloatBits != 16 || !Subtarget->hasFullFP16()))
17371 return SDValue();
17372
17373 MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
17374 uint32_t IntBits = IntTy.getSizeInBits();
17375 if (IntBits != 16 && IntBits != 32 && IntBits != 64)
17376 return SDValue();
17377
17378 // Avoid conversions where iN is larger than the float (e.g., float -> i64).
17379 if (IntBits > FloatBits)
17380 return SDValue();
17381
17382 BitVector UndefElements;
17383 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
17384 int32_t Bits = IntBits == 64 ? 64 : 32;
17385 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1);
17386 if (C == -1 || C == 0 || C > Bits)
17387 return SDValue();
17388
17389 EVT ResTy = Op.getValueType().changeVectorElementTypeToInteger();
17390 if (!DAG.getTargetLoweringInfo().isTypeLegal(ResTy))
17391 return SDValue();
17392
17393 if (N->getOpcode() == ISD::FP_TO_SINT_SAT ||
17394 N->getOpcode() == ISD::FP_TO_UINT_SAT) {
17395 EVT SatVT = cast<VTSDNode>(N->getOperand(1))->getVT();
17396 if (SatVT.getScalarSizeInBits() != IntBits || IntBits != FloatBits)
17397 return SDValue();
17398 }
17399
17400 SDLoc DL(N);
17401 bool IsSigned = (N->getOpcode() == ISD::FP_TO_SINT ||
17402 N->getOpcode() == ISD::FP_TO_SINT_SAT);
17403 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
17404 : Intrinsic::aarch64_neon_vcvtfp2fxu;
17405 SDValue FixConv =
17407 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32),
17408 Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32));
17409 // We can handle smaller integers by generating an extra trunc.
17410 if (IntBits < FloatBits)
17411 FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv);
17412
17413 return FixConv;
17414}
17415
17416/// Fold a floating-point divide by power of two into fixed-point to
17417/// floating-point conversion.
17420 const AArch64Subtarget *Subtarget) {
17421 if (!Subtarget->hasNEON())
17422 return SDValue();
17423
17424 SDValue Op = N->getOperand(0);
17425 unsigned Opc = Op->getOpcode();
17426 if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
17427 !Op.getOperand(0).getValueType().isSimple() ||
17428 (Opc != ISD::SINT_TO_FP && Opc != ISD::UINT_TO_FP))
17429 return SDValue();
17430
17431 SDValue ConstVec = N->getOperand(1);
17432 if (!isa<BuildVectorSDNode>(ConstVec))
17433 return SDValue();
17434
17435 MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
17436 int32_t IntBits = IntTy.getSizeInBits();
17437 if (IntBits != 16 && IntBits != 32 && IntBits != 64)
17438 return SDValue();
17439
17440 MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
17441 int32_t FloatBits = FloatTy.getSizeInBits();
17442 if (FloatBits != 32 && FloatBits != 64)
17443 return SDValue();
17444
17445 // Avoid conversions where iN is larger than the float (e.g., i64 -> float).
17446 if (IntBits > FloatBits)
17447 return SDValue();
17448
17449 BitVector UndefElements;
17450 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
17451 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, FloatBits + 1);
17452 if (C == -1 || C == 0 || C > FloatBits)
17453 return SDValue();
17454
17455 MVT ResTy;
17456 unsigned NumLanes = Op.getValueType().getVectorNumElements();
17457 switch (NumLanes) {
17458 default:
17459 return SDValue();
17460 case 2:
17461 ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
17462 break;
17463 case 4:
17464 ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64;
17465 break;
17466 }
17467
17468 if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
17469 return SDValue();
17470
17471 SDLoc DL(N);
17472 SDValue ConvInput = Op.getOperand(0);
17473 bool IsSigned = Opc == ISD::SINT_TO_FP;
17474 if (IntBits < FloatBits)
17475 ConvInput = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
17476 ResTy, ConvInput);
17477
17478 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfxs2fp
17479 : Intrinsic::aarch64_neon_vcvtfxu2fp;
17480 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
17481 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput,
17482 DAG.getConstant(C, DL, MVT::i32));
17483}
17484
17486 const AArch64TargetLowering &TLI) {
17487 EVT VT = N->getValueType(0);
17488 SelectionDAG &DAG = DCI.DAG;
17489 SDLoc DL(N);
17490
17491 if (!VT.isVector())
17492 return SDValue();
17493
17494 // The combining code currently only works for NEON vectors. In particular,
17495 // it does not work for SVE when dealing with vectors wider than 128 bits.
17496 // It also doesn't work for streaming mode because it causes generating
17497 // bsl instructions that are invalid in streaming mode.
17500 return SDValue();
17501
17502 SDValue N0 = N->getOperand(0);
17503 if (N0.getOpcode() != ISD::AND)
17504 return SDValue();
17505
17506 SDValue N1 = N->getOperand(1);
17507 if (N1.getOpcode() != ISD::AND)
17508 return SDValue();
17509
17510 // InstCombine does (not (neg a)) => (add a -1).
17511 // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c)
17512 // Loop over all combinations of AND operands.
17513 for (int i = 1; i >= 0; --i) {
17514 for (int j = 1; j >= 0; --j) {
17515 SDValue O0 = N0->getOperand(i);
17516 SDValue O1 = N1->getOperand(j);
17517 SDValue Sub, Add, SubSibling, AddSibling;
17518
17519 // Find a SUB and an ADD operand, one from each AND.
17520 if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) {
17521 Sub = O0;
17522 Add = O1;
17523 SubSibling = N0->getOperand(1 - i);
17524 AddSibling = N1->getOperand(1 - j);
17525 } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) {
17526 Add = O0;
17527 Sub = O1;
17528 AddSibling = N0->getOperand(1 - i);
17529 SubSibling = N1->getOperand(1 - j);
17530 } else
17531 continue;
17532
17534 continue;
17535
17536 // Constant ones is always righthand operand of the Add.
17537 if (!ISD::isBuildVectorAllOnes(Add.getOperand(1).getNode()))
17538 continue;
17539
17540 if (Sub.getOperand(1) != Add.getOperand(0))
17541 continue;
17542
17543 return DAG.getNode(AArch64ISD::BSP, DL, VT, Sub, SubSibling, AddSibling);
17544 }
17545 }
17546
17547 // (or (and a b) (and (not a) c)) => (bsl a b c)
17548 // We only have to look for constant vectors here since the general, variable
17549 // case can be handled in TableGen.
17550 unsigned Bits = VT.getScalarSizeInBits();
17551 uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
17552 for (int i = 1; i >= 0; --i)
17553 for (int j = 1; j >= 0; --j) {
17554 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
17555 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
17556 if (!BVN0 || !BVN1)
17557 continue;
17558
17559 bool FoundMatch = true;
17560 for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
17561 ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
17562 ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
17563 if (!CN0 || !CN1 ||
17564 CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
17565 FoundMatch = false;
17566 break;
17567 }
17568 }
17569
17570 if (FoundMatch)
17571 return DAG.getNode(AArch64ISD::BSP, DL, VT, SDValue(BVN0, 0),
17572 N0->getOperand(1 - i), N1->getOperand(1 - j));
17573 }
17574
17575 return SDValue();
17576}
17577
17578// Given a tree of and/or(csel(0, 1, cc0), csel(0, 1, cc1)), we may be able to
17579// convert to csel(ccmp(.., cc0)), depending on cc1:
17580
17581// (AND (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
17582// =>
17583// (CSET cc1 (CCMP x1 y1 !cc1 cc0 cmp0))
17584//
17585// (OR (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
17586// =>
17587// (CSET cc1 (CCMP x1 y1 cc1 !cc0 cmp0))
17589 EVT VT = N->getValueType(0);
17590 SDValue CSel0 = N->getOperand(0);
17591 SDValue CSel1 = N->getOperand(1);
17592
17593 if (CSel0.getOpcode() != AArch64ISD::CSEL ||
17594 CSel1.getOpcode() != AArch64ISD::CSEL)
17595 return SDValue();
17596
17597 if (!CSel0->hasOneUse() || !CSel1->hasOneUse())
17598 return SDValue();
17599
17600 if (!isNullConstant(CSel0.getOperand(0)) ||
17601 !isOneConstant(CSel0.getOperand(1)) ||
17602 !isNullConstant(CSel1.getOperand(0)) ||
17603 !isOneConstant(CSel1.getOperand(1)))
17604 return SDValue();
17605
17606 SDValue Cmp0 = CSel0.getOperand(3);
17607 SDValue Cmp1 = CSel1.getOperand(3);
17610 if (!Cmp0->hasOneUse() || !Cmp1->hasOneUse())
17611 return SDValue();
17612 if (Cmp1.getOpcode() != AArch64ISD::SUBS &&
17613 Cmp0.getOpcode() == AArch64ISD::SUBS) {
17614 std::swap(Cmp0, Cmp1);
17615 std::swap(CC0, CC1);
17616 }
17617
17618 if (Cmp1.getOpcode() != AArch64ISD::SUBS)
17619 return SDValue();
17620
17621 SDLoc DL(N);
17622 SDValue CCmp, Condition;
17623 unsigned NZCV;
17624
17625 if (N->getOpcode() == ISD::AND) {
17627 Condition = DAG.getConstant(InvCC0, DL, MVT_CC);
17629 } else {
17631 Condition = DAG.getConstant(CC0, DL, MVT_CC);
17633 }
17634
17635 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
17636
17637 auto *Op1 = dyn_cast<ConstantSDNode>(Cmp1.getOperand(1));
17638 if (Op1 && Op1->getAPIntValue().isNegative() &&
17639 Op1->getAPIntValue().sgt(-32)) {
17640 // CCMP accept the constant int the range [0, 31]
17641 // if the Op1 is a constant in the range [-31, -1], we
17642 // can select to CCMN to avoid the extra mov
17643 SDValue AbsOp1 =
17644 DAG.getConstant(Op1->getAPIntValue().abs(), DL, Op1->getValueType(0));
17645 CCmp = DAG.getNode(AArch64ISD::CCMN, DL, MVT_CC, Cmp1.getOperand(0), AbsOp1,
17646 NZCVOp, Condition, Cmp0);
17647 } else {
17648 CCmp = DAG.getNode(AArch64ISD::CCMP, DL, MVT_CC, Cmp1.getOperand(0),
17649 Cmp1.getOperand(1), NZCVOp, Condition, Cmp0);
17650 }
17651 return DAG.getNode(AArch64ISD::CSEL, DL, VT, CSel0.getOperand(0),
17652 CSel0.getOperand(1), DAG.getConstant(CC1, DL, MVT::i32),
17653 CCmp);
17654}
17655
17657 const AArch64Subtarget *Subtarget,
17658 const AArch64TargetLowering &TLI) {
17659 SelectionDAG &DAG = DCI.DAG;
17660 EVT VT = N->getValueType(0);
17661
17662 if (SDValue R = performANDORCSELCombine(N, DAG))
17663 return R;
17664
17665 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
17666 return SDValue();
17667
17668 if (SDValue Res = tryCombineToBSL(N, DCI, TLI))
17669 return Res;
17670
17671 return SDValue();
17672}
17673
17675 if (!MemVT.getVectorElementType().isSimple())
17676 return false;
17677
17678 uint64_t MaskForTy = 0ull;
17679 switch (MemVT.getVectorElementType().getSimpleVT().SimpleTy) {
17680 case MVT::i8:
17681 MaskForTy = 0xffull;
17682 break;
17683 case MVT::i16:
17684 MaskForTy = 0xffffull;
17685 break;
17686 case MVT::i32:
17687 MaskForTy = 0xffffffffull;
17688 break;
17689 default:
17690 return false;
17691 break;
17692 }
17693
17694 if (N->getOpcode() == AArch64ISD::DUP || N->getOpcode() == ISD::SPLAT_VECTOR)
17695 if (auto *Op0 = dyn_cast<ConstantSDNode>(N->getOperand(0)))
17696 return Op0->getAPIntValue().getLimitedValue() == MaskForTy;
17697
17698 return false;
17699}
17700
17702 SDValue LeafOp = SDValue(N, 0);
17703 SDValue Op = N->getOperand(0);
17704 while (Op.getOpcode() == AArch64ISD::REINTERPRET_CAST &&
17705 LeafOp.getValueType() != Op.getValueType())
17706 Op = Op->getOperand(0);
17707 if (LeafOp.getValueType() == Op.getValueType())
17708 return Op;
17709 return SDValue();
17710}
17711
// NOTE(review): the signature lines (orig. 17712-17713) were dropped by the
// extraction; upstream this appears to be
//   static SDValue performSVEAndCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
// SVE-specific AND combines: fold masks that are implied by unsigned unpacks
// or by zero-extending SVE loads, and drop ANDs with all-active predicates.
17714 if (DCI.isBeforeLegalizeOps())
17715 return SDValue();
17716
17717 SelectionDAG &DAG = DCI.DAG;
17718 SDValue Src = N->getOperand(0);
17719 unsigned Opc = Src->getOpcode();
17720
17721 // Zero/any extend of an unsigned unpack
17722 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
17723 SDValue UnpkOp = Src->getOperand(0);
17724 SDValue Dup = N->getOperand(1);
17725
17726 if (Dup.getOpcode() != ISD::SPLAT_VECTOR)
17727 return SDValue();
17728
17729 SDLoc DL(N);
17730 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Dup->getOperand(0));
17731 if (!C)
17732 return SDValue();
17733
17734 uint64_t ExtVal = C->getZExtValue();
17735
// True when the splat constant is exactly the all-ones mask for VT.
17736 auto MaskAndTypeMatch = [ExtVal](EVT VT) -> bool {
17737 return ((ExtVal == 0xFF && VT == MVT::i8) ||
17738 (ExtVal == 0xFFFF && VT == MVT::i16) ||
17739 (ExtVal == 0xFFFFFFFF && VT == MVT::i32));
17740 };
17741
17742 // If the mask is fully covered by the unpack, we don't need to push
17743 // a new AND onto the operand
17744 EVT EltTy = UnpkOp->getValueType(0).getVectorElementType();
17745 if (MaskAndTypeMatch(EltTy))
17746 return Src;
17747
17748 // If this is 'and (uunpklo/hi (extload MemTy -> ExtTy)), mask', then check
17749 // to see if the mask is all-ones of size MemTy.
17750 auto MaskedLoadOp = dyn_cast<MaskedLoadSDNode>(UnpkOp);
17751 if (MaskedLoadOp && (MaskedLoadOp->getExtensionType() == ISD::ZEXTLOAD ||
17752 MaskedLoadOp->getExtensionType() == ISD::EXTLOAD)) {
17753 EVT EltTy = MaskedLoadOp->getMemoryVT().getVectorElementType();
17754 if (MaskAndTypeMatch(EltTy))
17755 return Src;
17756 }
17757
17758 // Truncate to prevent a DUP with an over wide constant
17759 APInt Mask = C->getAPIntValue().trunc(EltTy.getSizeInBits());
17760
17761 // Otherwise, make sure we propagate the AND to the operand
17762 // of the unpack
17763 Dup = DAG.getNode(ISD::SPLAT_VECTOR, DL, UnpkOp->getValueType(0),
17764 DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32));
17765
17766 SDValue And = DAG.getNode(ISD::AND, DL,
17767 UnpkOp->getValueType(0), UnpkOp, Dup);
17768
17769 return DAG.getNode(Opc, DL, N->getValueType(0), And);
17770 }
17771
17772 // If both sides of AND operations are i1 splat_vectors then
17773 // we can produce just i1 splat_vector as the result.
17774 if (isAllActivePredicate(DAG, N->getOperand(0)))
17775 return N->getOperand(1);
17776 if (isAllActivePredicate(DAG, N->getOperand(1)))
17777 return N->getOperand(0);
17778
// NOTE(review): orig. line 17779 was dropped by the extraction — upstream it
// appears to be a guard like "if (!EnableCombineMGatherIntrinsics)"; confirm.
17780 return SDValue();
17781
17782 SDValue Mask = N->getOperand(1);
17783
17784 if (!Src.hasOneUse())
17785 return SDValue();
17786
17787 EVT MemVT;
17788
17789 // SVE load instructions perform an implicit zero-extend, which makes them
17790 // perfect candidates for combining.
17791 switch (Opc) {
// NOTE(review): case labels (orig. 17792-17794) dropped by extraction —
// upstream these are AArch64ISD::LD1/LDNF1/LDFF1 *_MERGE_ZERO opcodes whose
// memory VT is operand 3; confirm upstream before editing.
17795 MemVT = cast<VTSDNode>(Src->getOperand(3))->getVT();
17796 break;
// NOTE(review): case labels (orig. 17797-17811) dropped by extraction —
// upstream these are the gather-load *_MERGE_ZERO opcodes whose memory VT is
// operand 4; confirm upstream before editing.
17812 MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
17813 break;
17814 default:
17815 return SDValue();
17816 }
17817
// The AND is redundant if the mask is exactly what the zero-extending load
// already guarantees.
17818 if (isConstantSplatVectorMaskForType(Mask.getNode(), MemVT))
17819 return Src;
17820
17821 return SDValue();
17822}
17823
17824// Transform and(fcmp(a, b), fcmp(c, d)) into fccmp(fcmp(a, b), c, d)
// NOTE(review): the signature lines (orig. 17825-17826) were dropped by the
// extraction; upstream this appears to be
//   static SDValue performANDSETCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
17827
17828 // This function performs an optimization on a specific pattern involving
17829 // an AND operation and SETCC (Set Condition Code) node.
17830
17831 SDValue SetCC = N->getOperand(0);
17832 EVT VT = N->getValueType(0);
17833 SelectionDAG &DAG = DCI.DAG;
17834
17835 // Checks if the current node (N) is used by any SELECT instruction and
17836 // returns an empty SDValue to avoid applying the optimization to prevent
17837 // incorrect results
17838 for (auto U : N->uses())
17839 if (U->getOpcode() == ISD::SELECT)
17840 return SDValue();
17841
17842 // Check if the operand is a SETCC node with floating-point comparison
17843 if (SetCC.getOpcode() == ISD::SETCC &&
17844 SetCC.getOperand(0).getValueType() == MVT::f32) {
17845
17846 SDValue Cmp;
// NOTE(review): orig. line 17847 dropped by extraction — upstream it appears
// to declare "AArch64CC::CondCode CC;" used below; confirm upstream.
17848
17849 // Check if the DAG is after legalization and if we can emit the conjunction
17850 if (!DCI.isBeforeLegalize() &&
17851 (Cmp = emitConjunction(DAG, SDValue(N, 0), CC))) {
17852
// NOTE(review): orig. line 17853 dropped by extraction — upstream it appears
// to compute "AArch64CC::CondCode InvertedCC = ..." from CC; confirm.
17854
17855 SDLoc DL(N);
// CSINC with both inputs zero materialises the boolean: 0/1 by condition.
17856 return DAG.getNode(AArch64ISD::CSINC, DL, VT, DAG.getConstant(0, DL, VT),
17857 DAG.getConstant(0, DL, VT),
17858 DAG.getConstant(InvertedCC, DL, MVT::i32), Cmp);
17859 }
17860 }
17861 return SDValue();
17862}
17863
// NOTE(review): the signature lines (orig. 17864-17865) were dropped by the
// extraction; upstream this appears to be
//   static SDValue performANDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
// DAG combine for ISD::AND: CSEL/SETCC folds first, then SVE handling, then
// the NEON BIC-immediate rewrite for constant build-vector masks.
17866 SelectionDAG &DAG = DCI.DAG;
17867 SDValue LHS = N->getOperand(0);
17868 SDValue RHS = N->getOperand(1);
17869 EVT VT = N->getValueType(0);
17870
// and-of-CSELs -> conditional compare chain.
17871 if (SDValue R = performANDORCSELCombine(N, DAG))
17872 return R;
17873
// and(fcmp, fcmp) -> fccmp-based CSINC (see performANDSETCCCombine).
17874 if (SDValue R = performANDSETCCCombine(N,DCI))
17875 return R;
17876
17877 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
17878 return SDValue();
17879
17880 if (VT.isScalableVector())
17881 return performSVEAndCombine(N, DCI);
17882
17883 // The combining code below works only for NEON vectors. In particular, it
17884 // does not work for SVE when dealing with vectors wider than 128 bits.
17885 if (!VT.is64BitVector() && !VT.is128BitVector())
17886 return SDValue();
17887
17888 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
17889 if (!BVN)
17890 return SDValue();
17891
17892 // AND does not accept an immediate, so check if we can use a BIC immediate
17893 // instruction instead. We do this here instead of using a (and x, (mvni imm))
17894 // pattern in isel, because some immediates may be lowered to the preferred
17895 // (and x, (movi imm)) form, even though an mvni representation also exists.
17896 APInt DefBits(VT.getSizeInBits(), 0);
17897 APInt UndefBits(VT.getSizeInBits(), 0);
17898 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
17899 SDValue NewOp;
17900
17901 // Any bits known to already be 0 need not be cleared again, which can help
17902 // reduce the size of the immediate to one supported by the instruction.
17903 KnownBits Known = DAG.computeKnownBits(LHS);
// Replicate the per-element known-zero bits across the whole vector width.
17904 APInt ZeroSplat(VT.getSizeInBits(), 0);
17905 for (unsigned I = 0; I < VT.getSizeInBits() / Known.Zero.getBitWidth(); I++)
17906 ZeroSplat |= Known.Zero.zext(VT.getSizeInBits())
17907 << (Known.Zero.getBitWidth() * I);
17908
// BIC clears the set bits of its immediate, so invert the mask to get the
// bits that actually need clearing.
17909 DefBits = ~(DefBits | ZeroSplat);
17910 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
17911 DefBits, &LHS)) ||
17912 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
17913 DefBits, &LHS)))
17914 return NewOp;
17915
// Retry treating undef lanes as ones, which may yield an encodable immediate.
17916 UndefBits = ~(UndefBits | ZeroSplat);
17917 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
17918 UndefBits, &LHS)) ||
17919 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
17920 UndefBits, &LHS)))
17921 return NewOp;
17922 }
17923
17924 return SDValue();
17925}
17926
// NOTE(review): the signature lines (orig. 17927-17928) were dropped by the
// extraction; upstream this appears to be
//   static SDValue performFADDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
// Reassociates an FADD into a NEON VCMLA intrinsic's accumulator so chains of
// complex multiply-accumulates stay fused. Requires the reassoc fast-math flag.
17929 SelectionDAG &DAG = DCI.DAG;
17930 SDValue LHS = N->getOperand(0);
17931 SDValue RHS = N->getOperand(1);
17932 EVT VT = N->getValueType(0);
17933 SDLoc DL(N);
17934
// Moving an operand into the VCMLA accumulator reorders the additions, so it
// is only legal under allow-reassociation.
17935 if (!N->getFlags().hasAllowReassociation())
17936 return SDValue();
17937
17938 // Combine fadd(a, vcmla(b, c, d)) -> vcmla(fadd(a, b), b, c)
17939 auto ReassocComplex = [&](SDValue A, SDValue B) {
17940 if (A.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
17941 return SDValue();
17942 unsigned Opc = A.getConstantOperandVal(0);
// Only the four vcmla rotation variants are handled.
17943 if (Opc != Intrinsic::aarch64_neon_vcmla_rot0 &&
17944 Opc != Intrinsic::aarch64_neon_vcmla_rot90 &&
17945 Opc != Intrinsic::aarch64_neon_vcmla_rot180 &&
17946 Opc != Intrinsic::aarch64_neon_vcmla_rot270)
17947 return SDValue();
// Rebuild the vcmla with B folded into its accumulator operand.
17948 SDValue VCMLA = DAG.getNode(
17949 ISD::INTRINSIC_WO_CHAIN, DL, VT, A.getOperand(0),
17950 DAG.getNode(ISD::FADD, DL, VT, A.getOperand(1), B, N->getFlags()),
17951 A.getOperand(2), A.getOperand(3));
// Preserve the original vcmla's fast-math flags on the replacement node.
17952 VCMLA->setFlags(A->getFlags());
17953 return VCMLA;
17954 };
// FADD is commutative: try the vcmla on either side.
17955 if (SDValue R = ReassocComplex(LHS, RHS))
17956 return R;
17957 if (SDValue R = ReassocComplex(RHS, LHS))
17958 return R;
17959
17960 return SDValue();
17961}
17962
17963static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) {
17964 switch (Opcode) {
17965 case ISD::STRICT_FADD:
17966 case ISD::FADD:
17967 return (FullFP16 && VT == MVT::f16) || VT == MVT::f32 || VT == MVT::f64;
17968 case ISD::ADD:
17969 return VT == MVT::i64;
17970 default:
17971 return false;
17972 }
17973}
17974
// Forward declaration; emits a PTEST of Op against predicate Pg and
// materialises the requested condition as an i32 0/1 value.
17975static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
// NOTE(review): orig. line 17976 (the trailing parameter, presumably
// "AArch64CC::CondCode Cond);") and line 17978 (the signature, presumably
// "static bool isPredicateCCSettingOp(SDValue N) {") were dropped by the
// extraction — confirm against upstream LLVM.
17977
// Returns true if N is an operation that sets the SVE condition flags, i.e. a
// SETCC or one of the flag-setting WHILE* intrinsics (including
// get_active_lane_mask, which lowers to whilelo).
17979 if ((N.getOpcode() == ISD::SETCC) ||
17980 (N.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
17981 (N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilege ||
17982 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilegt ||
17983 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehi ||
17984 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehs ||
17985 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele ||
17986 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo ||
17987 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels ||
17988 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt ||
17989 // get_active_lane_mask is lowered to a whilelo instruction.
17990 N.getConstantOperandVal(0) == Intrinsic::get_active_lane_mask)))
17991 return true;
17992
17993 return false;
17994}
17995
17996// Materialize : i1 = extract_vector_elt t37, Constant:i64<0>
17997// ... into: "ptrue p, all" + PTEST
17998static SDValue
// NOTE(review): orig. lines 17999-18000 dropped by extraction; upstream the
// name/params appear to be
//   performFirstTrueTestVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
// — confirm upstream.
18001 const AArch64Subtarget *Subtarget) {
18002 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
18003 // Make sure PTEST can be legalised with illegal types.
18004 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
18005 return SDValue();
18006
18007 SDValue N0 = N->getOperand(0);
18008 EVT VT = N0.getValueType();
18009
// Only lane 0 of a scalable i1 (predicate) vector is handled here.
18010 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1 ||
18011 !isNullConstant(N->getOperand(1)))
18012 return SDValue();
18013
18014 // Restricted the DAG combine to only cases where we're extracting from a
18015 // flag-setting operation.
18016 if (!isPredicateCCSettingOp(N0))
18017 return SDValue();
18018
18019 // Extracts of lane 0 for SVE can be expressed as PTEST(Op, FIRST) ? 1 : 0
18020 SelectionDAG &DAG = DCI.DAG;
18021 SDValue Pg = getPTrue(DAG, SDLoc(N), VT, AArch64SVEPredPattern::all);
18022 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::FIRST_ACTIVE);
18023}
18024
18025// Materialize : Idx = (add (mul vscale, NumEls), -1)
18026// i1 = extract_vector_elt t37, Constant:i64<Idx>
18027// ... into: "ptrue p, all" + PTEST
18028static SDValue
// NOTE(review): orig. lines 18029-18030 dropped by extraction; upstream the
// name/params appear to be
//   performLastTrueTestVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
// — confirm upstream.
18031 const AArch64Subtarget *Subtarget) {
18032 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
18033 // Make sure PTEST is legal types.
18034 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
18035 return SDValue();
18036
18037 SDValue N0 = N->getOperand(0);
18038 EVT OpVT = N0.getValueType();
18039
// Only scalable i1 (predicate) vectors are handled here.
18040 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
18041 return SDValue();
18042
18043 // Idx == (add (mul vscale, NumEls), -1)
18044 SDValue Idx = N->getOperand(1);
18045 if (Idx.getOpcode() != ISD::ADD || !isAllOnesConstant(Idx.getOperand(1)))
18046 return SDValue();
18047
18048 SDValue VS = Idx.getOperand(0);
18049 if (VS.getOpcode() != ISD::VSCALE)
18050 return SDValue();
18051
// The VSCALE multiplier must equal the known-minimum element count, i.e. the
// index really is element-count minus one (the last lane).
18052 unsigned NumEls = OpVT.getVectorElementCount().getKnownMinValue();
18053 if (VS.getConstantOperandVal(0) != NumEls)
18054 return SDValue();
18055
18056 // Extracts of lane EC-1 for SVE can be expressed as PTEST(Op, LAST) ? 1 : 0
18057 SelectionDAG &DAG = DCI.DAG;
18058 SDValue Pg = getPTrue(DAG, SDLoc(N), OpVT, AArch64SVEPredPattern::all);
18059 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::LAST_ACTIVE);
18060}
18061
18062static SDValue
// NOTE(review): orig. line 18063 dropped by extraction; upstream the
// name/params appear to be
//   performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
// — confirm upstream.
// Combines for EXTRACT_VECTOR_ELT: SVE PTEST materialisations, extract(dup),
// and the pairwise-add rewrite.
18064 const AArch64Subtarget *Subtarget) {
18065 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
18066 if (SDValue Res = performFirstTrueTestVectorCombine(N, DCI, Subtarget))
18067 return Res;
18068 if (SDValue Res = performLastTrueTestVectorCombine(N, DCI, Subtarget))
18069 return Res;
18070
18071 SelectionDAG &DAG = DCI.DAG;
18072 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
18073
18074 EVT VT = N->getValueType(0);
18075 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
18076 bool IsStrict = N0->isStrictFPOpcode();
18077
18078 // extract(dup x) -> x
18079 if (N0.getOpcode() == AArch64ISD::DUP)
18080 return VT.isInteger() ? DAG.getZExtOrTrunc(N0.getOperand(0), SDLoc(N), VT)
18081 : N0.getOperand(0);
18082
18083 // Rewrite for pairwise fadd pattern
18084 // (f32 (extract_vector_elt
18085 // (fadd (vXf32 Other)
18086 // (vector_shuffle (vXf32 Other) undef <1,X,...> )) 0))
18087 // ->
18088 // (f32 (fadd (extract_vector_elt (vXf32 Other) 0)
18089 // (extract_vector_elt (vXf32 Other) 1))
18090 // For strict_fadd we need to make sure the old strict_fadd can be deleted, so
18091 // we can only do this when it's used only by the extract_vector_elt.
18092 if (isNullConstant(N1) && hasPairwiseAdd(N0->getOpcode(), VT, FullFP16) &&
18093 (!IsStrict || N0.hasOneUse())) {
18094 SDLoc DL(N0);
// Strict FP nodes carry their chain as operand 0, shifting the data operands.
18095 SDValue N00 = N0->getOperand(IsStrict ? 1 : 0);
18096 SDValue N01 = N0->getOperand(IsStrict ? 2 : 1);
18097
18098 ShuffleVectorSDNode *Shuffle = dyn_cast<ShuffleVectorSDNode>(N01);
18099 SDValue Other = N00;
18100
18101 // And handle the commutative case.
18102 if (!Shuffle) {
18103 Shuffle = dyn_cast<ShuffleVectorSDNode>(N00);
18104 Other = N01;
18105 }
18106
// The shuffle must bring lane 1 of the same source to lane 0, so the add at
// lane 0 is exactly source[0] + source[1].
18107 if (Shuffle && Shuffle->getMaskElt(0) == 1 &&
18108 Other == Shuffle->getOperand(0)) {
18109 SDValue Extract1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
18110 DAG.getConstant(0, DL, MVT::i64));
18111 SDValue Extract2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
18112 DAG.getConstant(1, DL, MVT::i64));
18113 if (!IsStrict)
18114 return DAG.getNode(N0->getOpcode(), DL, VT, Extract1, Extract2);
18115
18116 // For strict_fadd we need uses of the final extract_vector to be replaced
18117 // with the strict_fadd, but we also need uses of the chain output of the
18118 // original strict_fadd to use the chain output of the new strict_fadd as
18119 // otherwise it may not be deleted.
18120 SDValue Ret = DAG.getNode(N0->getOpcode(), DL,
18121 {VT, MVT::Other},
18122 {N0->getOperand(0), Extract1, Extract2});
18123 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Ret);
18124 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Ret.getValue(1));
18125 return SDValue(N, 0);
18126 }
18127 }
18128
18129 return SDValue();
18130}
18131
// NOTE(review): the signature lines (orig. 18132-18133) were dropped by the
// extraction; upstream this appears to be
//   static SDValue performConcatVectorsCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
// — confirm upstream. A collection of CONCAT_VECTORS folds for fixed-width
// (NEON) vectors; scalable vectors bail out immediately.
18134 SelectionDAG &DAG) {
18135 SDLoc dl(N);
18136 EVT VT = N->getValueType(0);
18137 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
18138 unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode();
18139
18140 if (VT.isScalableVector())
18141 return SDValue();
18142
18143 // Optimize concat_vectors of truncated vectors, where the intermediate
18144 // type is illegal, to avoid said illegality, e.g.,
18145 // (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
18146 // (v2i16 (truncate (v2i64)))))
18147 // ->
18148 // (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
18149 // (v4i32 (bitcast (v2i64))),
18150 // <0, 2, 4, 6>)))
18151 // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
18152 // on both input and result type, so we might generate worse code.
18153 // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
18154 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
18155 N1Opc == ISD::TRUNCATE) {
18156 SDValue N00 = N0->getOperand(0);
18157 SDValue N10 = N1->getOperand(0);
18158 EVT N00VT = N00.getValueType();
18159
18160 if (N00VT == N10.getValueType() &&
18161 (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) &&
18162 N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
18163 MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
// NOTE(review): orig. line 18164 dropped by extraction — it appears to declare
// the shuffle mask vector (e.g. "SmallVector<int, 8> Mask(...)"); confirm.
18165 for (size_t i = 0; i < Mask.size(); ++i)
18166 Mask[i] = i * 2;
18167 return DAG.getNode(ISD::TRUNCATE, dl, VT,
18168 DAG.getVectorShuffle(
18169 MidVT, dl,
18170 DAG.getNode(ISD::BITCAST, dl, MidVT, N00),
18171 DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask));
18172 }
18173 }
18174
18175 if (N->getOperand(0).getValueType() == MVT::v4i8) {
18176 // If we have a concat of v4i8 loads, convert them to a buildvector of f32
18177 // loads to prevent having to go through the v4i8 load legalization that
18178 // needs to extend each element into a larger type.
18179 if (N->getNumOperands() % 2 == 0 && all_of(N->op_values(), [](SDValue V) {
18180 if (V.getValueType() != MVT::v4i8)
18181 return false;
18182 if (V.isUndef())
18183 return true;
18184 LoadSDNode *LD = dyn_cast<LoadSDNode>(V);
18185 return LD && V.hasOneUse() && LD->isSimple() && !LD->isIndexed() &&
18186 LD->getExtensionType() == ISD::NON_EXTLOAD;
18187 })) {
18188 EVT NVT =
18189 EVT::getVectorVT(*DAG.getContext(), MVT::f32, N->getNumOperands());
// NOTE(review): orig. line 18190 dropped by extraction — it appears to declare
// the operand vector (e.g. "SmallVector<SDValue> Ops;"); confirm upstream.
18191
18192 for (unsigned i = 0; i < N->getNumOperands(); i++) {
18193 SDValue V = N->getOperand(i);
18194 if (V.isUndef())
18195 Ops.push_back(DAG.getUNDEF(MVT::f32));
18196 else {
18197 LoadSDNode *LD = cast<LoadSDNode>(V);
// Same address and memory operand, just reinterpreted as a 32-bit f32 load;
// the old load's chain users are redirected to the new load.
18198 SDValue NewLoad =
18199 DAG.getLoad(MVT::f32, dl, LD->getChain(), LD->getBasePtr(),
18200 LD->getMemOperand());
18201 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1));
18202 Ops.push_back(NewLoad);
18203 }
18204 }
18205 return DAG.getBitcast(N->getValueType(0),
18206 DAG.getBuildVector(NVT, dl, Ops));
18207 }
18208 }
18209
18210 // Canonicalise concat_vectors to replace concatenations of truncated nots
18211 // with nots of concatenated truncates. This in some cases allows for multiple
18212 // redundant negations to be eliminated.
18213 // (concat_vectors (v4i16 (truncate (not (v4i32)))),
18214 // (v4i16 (truncate (not (v4i32)))))
18215 // ->
18216 // (not (concat_vectors (v4i16 (truncate (v4i32))),
18217 // (v4i16 (truncate (v4i32)))))
18218 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
18219 N1Opc == ISD::TRUNCATE && N->isOnlyUserOf(N0.getNode()) &&
18220 N->isOnlyUserOf(N1.getNode())) {
18221 auto isBitwiseVectorNegate = [](SDValue V) {
18222 return V->getOpcode() == ISD::XOR &&
18223 ISD::isConstantSplatVectorAllOnes(V.getOperand(1).getNode());
18224 };
18225 SDValue N00 = N0->getOperand(0);
18226 SDValue N10 = N1->getOperand(0);
18227 if (isBitwiseVectorNegate(N00) && N0->isOnlyUserOf(N00.getNode()) &&
18228 isBitwiseVectorNegate(N10) && N1->isOnlyUserOf(N10.getNode())) {
18229 return DAG.getNOT(
18230 dl,
18231 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
18232 DAG.getNode(ISD::TRUNCATE, dl, N0.getValueType(),
18233 N00->getOperand(0)),
18234 DAG.getNode(ISD::TRUNCATE, dl, N1.getValueType(),
18235 N10->getOperand(0))),
18236 VT);
18237 }
18238 }
18239
18240 // Wait till after everything is legalized to try this. That way we have
18241 // legal vector types and such.
18242 if (DCI.isBeforeLegalizeOps())
18243 return SDValue();
18244
18245 // Optimise concat_vectors of two [us]avgceils or [us]avgfloors that use
18246 // extracted subvectors from the same original vectors. Combine these into a
18247 // single avg that operates on the two original vectors.
18248 // avgceil is the target independant name for rhadd, avgfloor is a hadd.
18249 // Example:
18250 // (concat_vectors (v8i8 (avgceils (extract_subvector (v16i8 OpA, <0>),
18251 // extract_subvector (v16i8 OpB, <0>))),
18252 // (v8i8 (avgceils (extract_subvector (v16i8 OpA, <8>),
18253 // extract_subvector (v16i8 OpB, <8>)))))
18254 // ->
18255 // (v16i8(avgceils(v16i8 OpA, v16i8 OpB)))
18256 if (N->getNumOperands() == 2 && N0Opc == N1Opc &&
18257 (N0Opc == ISD::AVGCEILU || N0Opc == ISD::AVGCEILS ||
18258 N0Opc == ISD::AVGFLOORU || N0Opc == ISD::AVGFLOORS)) {
18259 SDValue N00 = N0->getOperand(0);
18260 SDValue N01 = N0->getOperand(1);
18261 SDValue N10 = N1->getOperand(0);
18262 SDValue N11 = N1->getOperand(1);
18263
18264 EVT N00VT = N00.getValueType();
18265 EVT N10VT = N10.getValueType();
18266
18267 if (N00->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
// NOTE(review): orig. lines 18268-18269 dropped by extraction — they appear to
// be the matching EXTRACT_SUBVECTOR opcode checks for N01 and N10; confirm.
18270 N11->getOpcode() == ISD::EXTRACT_SUBVECTOR && N00VT == N10VT) {
18271 SDValue N00Source = N00->getOperand(0);
18272 SDValue N01Source = N01->getOperand(0);
18273 SDValue N10Source = N10->getOperand(0);
18274 SDValue N11Source = N11->getOperand(0);
18275
18276 if (N00Source == N10Source && N01Source == N11Source &&
18277 N00Source.getValueType() == VT && N01Source.getValueType() == VT) {
18278 assert(N0.getValueType() == N1.getValueType());
18279
18280 uint64_t N00Index = N00.getConstantOperandVal(1);
18281 uint64_t N01Index = N01.getConstantOperandVal(1);
18282 uint64_t N10Index = N10.getConstantOperandVal(1);
18283 uint64_t N11Index = N11.getConstantOperandVal(1);
18284
// The first avg must read the low halves and the second the high halves of
// the same two sources; then the concat is just the full-width avg.
18285 if (N00Index == N01Index && N10Index == N11Index && N00Index == 0 &&
18286 N10Index == N00VT.getVectorNumElements())
18287 return DAG.getNode(N0Opc, dl, VT, N00Source, N01Source);
18288 }
18289 }
18290 }
18291
// Matches a rounding-shift-right-narrow candidate: VLSHR of an ADD whose
// second operand is the splat constant 1 << (shift-1).
18292 auto IsRSHRN = [](SDValue Shr) {
18293 if (Shr.getOpcode() != AArch64ISD::VLSHR)
18294 return false;
18295 SDValue Op = Shr.getOperand(0);
18296 EVT VT = Op.getValueType();
18297 unsigned ShtAmt = Shr.getConstantOperandVal(1);
18298 if (ShtAmt > VT.getScalarSizeInBits() / 2 || Op.getOpcode() != ISD::ADD)
18299 return false;
18300
18301 APInt Imm;
18302 if (Op.getOperand(1).getOpcode() == AArch64ISD::MOVIshift)
18303 Imm = APInt(VT.getScalarSizeInBits(),
18304 Op.getOperand(1).getConstantOperandVal(0)
18305 << Op.getOperand(1).getConstantOperandVal(1));
18306 else if (Op.getOperand(1).getOpcode() == AArch64ISD::DUP &&
18307 isa<ConstantSDNode>(Op.getOperand(1).getOperand(0)))
18308 Imm = APInt(VT.getScalarSizeInBits(),
18309 Op.getOperand(1).getConstantOperandVal(0));
18310 else
18311 return false;
18312
18313 if (Imm != 1ULL << (ShtAmt - 1))
18314 return false;
18315 return true;
18316 };
18317
18318 // concat(rshrn(x), rshrn(y)) -> rshrn(concat(x, y))
18319 if (N->getNumOperands() == 2 && IsRSHRN(N0) &&
18320 ((IsRSHRN(N1) &&
// NOTE(review): orig. line 18321 dropped by extraction — it appears to require
// matching shift amounts between N0 and N1 (the "||" alternative being an
// undef N1); confirm upstream.
18322 N1.isUndef())) {
18323 SDValue X = N0.getOperand(0).getOperand(0);
18324 SDValue Y = N1.isUndef() ? DAG.getUNDEF(X.getValueType())
18325 : N1.getOperand(0).getOperand(0);
18326 EVT BVT =
18327 X.getValueType().getDoubleNumVectorElementsVT(*DCI.DAG.getContext());
// Rebuild the round-add and shift at the doubled width.
18328 SDValue CC = DAG.getNode(ISD::CONCAT_VECTORS, dl, BVT, X, Y);
18329 SDValue Add = DAG.getNode(
18330 ISD::ADD, dl, BVT, CC,
18331 DAG.getConstant(1ULL << (N0.getConstantOperandVal(1) - 1), dl, BVT));
18332 SDValue Shr =
18333 DAG.getNode(AArch64ISD::VLSHR, dl, BVT, Add, N0.getOperand(1));
18334 return Shr;
18335 }
18336
18337 // concat(zip1(a, b), zip2(a, b)) is zip1(a, b)
18338 if (N->getNumOperands() == 2 && N0Opc == AArch64ISD::ZIP1 &&
18339 N1Opc == AArch64ISD::ZIP2 && N0.getOperand(0) == N1.getOperand(0) &&
18340 N0.getOperand(1) == N1.getOperand(1)) {
18341 SDValue E0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N0.getOperand(0),
18342 DAG.getUNDEF(N0.getValueType()));
18343 SDValue E1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N0.getOperand(1),
18344 DAG.getUNDEF(N0.getValueType()));
18345 return DAG.getNode(AArch64ISD::ZIP1, dl, VT, E0, E1);
18346 }
18347
18348 // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
18349 // splat. The indexed instructions are going to be expecting a DUPLANE64, so
18350 // canonicalise to that.
18351 if (N->getNumOperands() == 2 && N0 == N1 && VT.getVectorNumElements() == 2) {
18352 assert(VT.getScalarSizeInBits() == 64);
18353 return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG),
18354 DAG.getConstant(0, dl, MVT::i64));
18355 }
18356
18357 // Canonicalise concat_vectors so that the right-hand vector has as few
18358 // bit-casts as possible before its real operation. The primary matching
18359 // destination for these operations will be the narrowing "2" instructions,
18360 // which depend on the operation being performed on this right-hand vector.
18361 // For example,
18362 // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS))))
18363 // becomes
18364 // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
18365
18366 if (N->getNumOperands() != 2 || N1Opc != ISD::BITCAST)
18367 return SDValue();
18368 SDValue RHS = N1->getOperand(0);
18369 MVT RHSTy = RHS.getValueType().getSimpleVT();
18370 // If the RHS is not a vector, this is not the pattern we're looking for.
18371 if (!RHSTy.isVector())
18372 return SDValue();
18373
18374 LLVM_DEBUG(
18375 dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");
18376
18377 MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
18378 RHSTy.getVectorNumElements() * 2);
18379 return DAG.getNode(ISD::BITCAST, dl, VT,
18380 DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy,
18381 DAG.getNode(ISD::BITCAST, dl, RHSTy, N0),
18382 RHS));
18383}
18384
18385static SDValue
// NOTE(review): orig. line 18386 dropped by extraction; upstream the
// name/params appear to be
//   performExtractSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
// — confirm upstream.
// Folds extract_subvector of a constant splat into a narrower splat, for
// scalable i1 (predicate) result types only.
18387 SelectionDAG &DAG) {
18388 if (DCI.isBeforeLegalizeOps())
18389 return SDValue();
18390
18391 EVT VT = N->getValueType(0);
18392 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1)
18393 return SDValue();
18394
18395 SDValue V = N->getOperand(0);
18396
18397 // NOTE: This combine exists in DAGCombiner, but that version's legality check
18398 // blocks this combine because the non-const case requires custom lowering.
18399 //
18400 // ty1 extract_vector(ty2 splat(const))) -> ty1 splat(const)
18401 if (V.getOpcode() == ISD::SPLAT_VECTOR)
18402 if (isa<ConstantSDNode>(V.getOperand(0)))
18403 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), VT, V.getOperand(0));
18404
18405 return SDValue();
18406}
18407
18408static SDValue
// NOTE(review): orig. line 18409 dropped by extraction; upstream the
// name/params appear to be
//   performInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
// — confirm upstream.
// Rewrites a half-width, aligned INSERT_SUBVECTOR on legal fixed vectors as a
// CONCAT_VECTORS of the subvector and the untouched half.
18410 SelectionDAG &DAG) {
18411 SDLoc DL(N);
18412 SDValue Vec = N->getOperand(0);
18413 SDValue SubVec = N->getOperand(1);
18414 uint64_t IdxVal = N->getConstantOperandVal(2);
18415 EVT VecVT = Vec.getValueType();
18416 EVT SubVT = SubVec.getValueType();
18417
18418 // Only do this for legal fixed vector types.
18419 if (!VecVT.isFixedLengthVector() ||
18420 !DAG.getTargetLoweringInfo().isTypeLegal(VecVT) ||
18421 !DAG.getTargetLoweringInfo().isTypeLegal(SubVT))
18422 return SDValue();
18423
18424 // Ignore widening patterns.
18425 if (IdxVal == 0 && Vec.isUndef())
18426 return SDValue();
18427
18428 // Subvector must be half the width and an "aligned" insertion.
18429 unsigned NumSubElts = SubVT.getVectorNumElements();
18430 if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
18431 (IdxVal != 0 && IdxVal != NumSubElts))
18432 return SDValue();
18433
18434 // Fold insert_subvector -> concat_vectors
18435 // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
18436 // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
18437 SDValue Lo, Hi;
18438 if (IdxVal == 0) {
18439 Lo = SubVec;
18440 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
18441 DAG.getVectorIdxConstant(NumSubElts, DL));
18442 } else {
18443 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
18444 DAG.getVectorIdxConstant(0, DL));
18445 Hi = SubVec;
18446 }
18447 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi);
18448}
18449
// NOTE(review): the signature lines (orig. 18450-18451) were dropped by the
// extraction; upstream this appears to be
//   static SDValue tryCombineFixedPointConvert(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
// — confirm upstream.
18452 SelectionDAG &DAG) {
18453 // Wait until after everything is legalized to try this. That way we have
18454 // legal vector types and such.
18455 if (DCI.isBeforeLegalizeOps())
18456 return SDValue();
18457 // Transform a scalar conversion of a value from a lane extract into a
18458 // lane extract of a vector conversion. E.g., from foo1 to foo2:
18459 // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
18460 // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
18461 //
18462 // The second form interacts better with instruction selection and the
18463 // register allocator to avoid cross-class register copies that aren't
18464 // coalescable due to a lane reference.
18465
18466 // Check the operand and see if it originates from a lane extract.
18467 SDValue Op1 = N->getOperand(1);
// NOTE(review): orig. line 18468 dropped by extraction — it appears to be the
// guard "if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT)"; confirm upstream.
18469 return SDValue();
18470
18471 // Yep, no additional predication needed. Perform the transform.
18472 SDValue IID = N->getOperand(0);
18473 SDValue Shift = N->getOperand(2);
18474 SDValue Vec = Op1.getOperand(0);
18475 SDValue Lane = Op1.getOperand(1);
18476 EVT ResTy = N->getValueType(0);
18477 EVT VecResTy;
18478 SDLoc DL(N);
18479
18480 // The vector width should be 128 bits by the time we get here, even
18481 // if it started as 64 bits (the extract_vector handling will have
18482 // done so). Bail if it is not.
18483 if (Vec.getValueSizeInBits() != 128)
18484 return SDValue();
18485
18486 if (Vec.getValueType() == MVT::v4i32)
18487 VecResTy = MVT::v4f32;
18488 else if (Vec.getValueType() == MVT::v2i64)
18489 VecResTy = MVT::v2f64;
18490 else
18491 return SDValue();
18492
// Do the conversion on the whole vector, then extract the requested lane.
18493 SDValue Convert =
18494 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
18495 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
18496}
18497
18498// AArch64 high-vector "long" operations are formed by performing the non-high
18499// version on an extract_subvector of each operand which gets the high half:
18500//
18501// (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
18502//
18503// However, there are cases which don't have an extract_high explicitly, but
18504// have another operation that can be made compatible with one for free. For
18505// example:
18506//
18507// (dupv64 scalar) --> (extract_high (dup128 scalar))
18508//
18509// This routine does the actual conversion of such DUPs, once outer routines
18510// have determined that everything else is in order.
18511// It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold
18512// similarly here.
// NOTE(review): the signature line (orig. 18513) was dropped by extraction;
// upstream this appears to be
//   static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
18514 MVT VT = N.getSimpleValueType();
// Look through an extract of the low half: the inner node is the real DUP.
18515 if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
18516 N.getConstantOperandVal(1) == 0)
18517 N = N.getOperand(0);
18518
18519 switch (N.getOpcode()) {
18520 case AArch64ISD::DUP:
// NOTE(review): case labels (orig. 18521-18524) dropped by extraction —
// upstream these appear to be the DUPLANE8/16/32/64 opcodes; confirm.
18525 case AArch64ISD::MOVI:
// NOTE(review): case labels (orig. 18526-18530) dropped by extraction —
// upstream these appear to be the MOVIshift/MOVIedit/MOVImsl and
// MVNIshift/MVNImsl immediate-DUP opcodes; confirm.
18531 break;
18532 default:
18533 // FMOV could be supported, but isn't very useful, as it would only occur
18534 // if you passed a bitcast' floating point immediate to an eligible long
18535 // integer op (addl, smull, ...).
18536 return SDValue();
18537 }
18538
18539 if (!VT.is64BitVector())
18540 return SDValue();
18541
18542 SDLoc DL(N);
18543 unsigned NumElems = VT.getVectorNumElements();
// Widen the DUP to 128 bits if it is currently producing a 64-bit vector.
18544 if (N.getValueType().is64BitVector()) {
18545 MVT ElementTy = VT.getVectorElementType();
18546 MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);
18547 N = DAG.getNode(N->getOpcode(), DL, NewVT, N->ops());
18548 }
18549
// Extract the high half of the widened DUP (splat: both halves are equal).
18550 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N,
18551 DAG.getConstant(NumElems, DL, MVT::i64));
18552}
18553
// NOTE(review): the signature line (orig. 18554) was dropped by extraction;
// upstream this appears to be
//   static bool isEssentiallyExtractHighSubvector(SDValue N) {
// Returns true if N (possibly behind a bitcast) extracts exactly the upper
// half of a fixed-width vector.
18555 if (N.getOpcode() == ISD::BITCAST)
18556 N = N.getOperand(0);
18557 if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR)
18558 return false;
18559 if (N.getOperand(0).getValueType().isScalableVector())
18560 return false;
// The extraction index must be the source's midpoint.
18561 return N.getConstantOperandAPInt(1) ==
18562 N.getOperand(0).getValueType().getVectorNumElements() / 2;
18563}
18564
18565/// Helper structure to keep track of ISD::SET_CC operands.
// NOTE(review): the struct name and member lines (orig. 18566-18569) were
// dropped by the extraction; upstream this appears to be GenericSetCCInfo with
// two operand pointers (Opnd0/Opnd1) and an ISD::CondCode CC — confirm.
18570};
18571
18572/// Helper structure to keep track of a SET_CC lowered into AArch64 code.
// NOTE(review): the struct name line (orig. 18573) and the condition-code
// member (orig. 18575) were dropped by extraction; upstream this appears to be
// AArch64SetCCInfo, where Cmp points at the compare operand of a CSEL and the
// missing member is an AArch64CC::CondCode CC — confirm.
18574 const SDValue *Cmp;
18576};
18577
18578/// Helper structure to keep track of SetCC information.
// NOTE(review): orig. lines 18579-18581 dropped by extraction; upstream this
// appears to be "union SetCCInfo" holding a GenericSetCCInfo and an
// AArch64SetCCInfo — confirm.
18582};
18583
18584/// Helper structure to be able to read SetCC information. If set to
18585/// true, IsAArch64 field, Info is a AArch64SetCCInfo, otherwise Info is a
18586/// GenericSetCCInfo.
// NOTE(review): orig. lines 18587-18589 dropped by extraction; upstream this
// appears to be "struct SetCCInfoAndKind" with a SetCCInfo Info member and a
// bool IsAArch64 discriminant — confirm.
18590};
18591
18592/// Check whether or not \p Op is a SET_CC operation, either a generic or
18593/// an
18594/// AArch64 lowered one.
18595/// \p SetCCInfo is filled accordingly.
18596/// \post SetCCInfo is meanginfull only when this function returns true.
18597/// \return True when Op is a kind of SET_CC operation.
// NOTE(review): the signature line (orig. 18598) was dropped by extraction;
// upstream it appears to be
//   static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
18599 // If this is a setcc, this is straight forward.
18600 if (Op.getOpcode() == ISD::SETCC) {
18601 SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
18602 SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
18603 SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
18604 SetCCInfo.IsAArch64 = false;
18605 return true;
18606 }
18607 // Otherwise, check if this is a matching csel instruction.
18608 // In other words:
18609 // - csel 1, 0, cc
18610 // - csel 0, 1, !cc
18611 if (Op.getOpcode() != AArch64ISD::CSEL)
18612 return false;
18613 // Set the information about the operands.
18614 // TODO: we want the operands of the Cmp not the csel
18615 SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
18616 SetCCInfo.IsAArch64 = true;
18617 SetCCInfo.Info.AArch64.CC =
18618 static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
18619
18620 // Check that the operands matches the constraints:
18621 // (1) Both operands must be constants.
18622 // (2) One must be 1 and the other must be 0.
18623 ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
18624 ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));
18625
18626 // Check (1).
18627 if (!TValue || !FValue)
18628 return false;
18629
18630 // Check (2).
18631 if (!TValue->isOne()) {
18632 // Update the comparison when we are interested in !cc.
18633 std::swap(TValue, FValue);
18634 SetCCInfo.Info.AArch64.CC =
// NOTE(review): orig. line 18635 dropped by extraction — it appears to invert
// the condition, e.g. "AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC);"
// — confirm upstream.
18636 }
18637 return TValue->isOne() && FValue->isZero();
18638}
18639
18640// Returns true if Op is setcc or zext of setcc.
18641static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
18642 if (isSetCC(Op, Info))
18643 return true;
18644 return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
18645 isSetCC(Op->getOperand(0), Info));
18646}
18647
18648// The folding we want to perform is:
18649// (add x, [zext] (setcc cc ...) )
18650// -->
18651// (csel x, (add x, 1), !cc ...)
18652//
18653// The latter will get matched to a CSINC instruction.
18655 assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
18656 SDValue LHS = Op->getOperand(0);
18657 SDValue RHS = Op->getOperand(1);
18658 SetCCInfoAndKind InfoAndKind;
18659
18660 // If both operands are a SET_CC, then we don't want to perform this
18661 // folding and create another csel as this results in more instructions
18662 // (and higher register usage).
18663 if (isSetCCOrZExtSetCC(LHS, InfoAndKind) &&
18664 isSetCCOrZExtSetCC(RHS, InfoAndKind))
18665 return SDValue();
18666
18667 // If neither operand is a SET_CC, give up.
18668 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
18669 std::swap(LHS, RHS);
18670 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind))
18671 return SDValue();
18672 }
18673
18674 // FIXME: This could be generatized to work for FP comparisons.
18675 EVT CmpVT = InfoAndKind.IsAArch64
18676 ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType()
18677 : InfoAndKind.Info.Generic.Opnd0->getValueType();
18678 if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
18679 return SDValue();
18680
18681 SDValue CCVal;
18682 SDValue Cmp;
18683 SDLoc dl(Op);
18684 if (InfoAndKind.IsAArch64) {
18685 CCVal = DAG.getConstant(
18687 MVT::i32);
18688 Cmp = *InfoAndKind.Info.AArch64.Cmp;
18689 } else
18690 Cmp = getAArch64Cmp(
18691 *InfoAndKind.Info.Generic.Opnd0, *InfoAndKind.Info.Generic.Opnd1,
18692 ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, CmpVT), CCVal, DAG,
18693 dl);
18694
18695 EVT VT = Op->getValueType(0);
18696 LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, dl, VT));
18697 return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp);
18698}
18699
18700// ADD(UADDV a, UADDV b) --> UADDV(ADD a, b)
18702 EVT VT = N->getValueType(0);
18703 // Only scalar integer and vector types.
18704 if (N->getOpcode() != ISD::ADD || !VT.isScalarInteger())
18705 return SDValue();
18706
18707 SDValue LHS = N->getOperand(0);
18708 SDValue RHS = N->getOperand(1);
18709 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
18710 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || LHS.getValueType() != VT)
18711 return SDValue();
18712
18713 auto *LHSN1 = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
18714 auto *RHSN1 = dyn_cast<ConstantSDNode>(RHS->getOperand(1));
18715 if (!LHSN1 || LHSN1 != RHSN1 || !RHSN1->isZero())
18716 return SDValue();
18717
18718 SDValue Op1 = LHS->getOperand(0);
18719 SDValue Op2 = RHS->getOperand(0);
18720 EVT OpVT1 = Op1.getValueType();
18721 EVT OpVT2 = Op2.getValueType();
18722 if (Op1.getOpcode() != AArch64ISD::UADDV || OpVT1 != OpVT2 ||
18723 Op2.getOpcode() != AArch64ISD::UADDV ||
18724 OpVT1.getVectorElementType() != VT)
18725 return SDValue();
18726
18727 SDValue Val1 = Op1.getOperand(0);
18728 SDValue Val2 = Op2.getOperand(0);
18729 EVT ValVT = Val1->getValueType(0);
18730 SDLoc DL(N);
18731 SDValue AddVal = DAG.getNode(ISD::ADD, DL, ValVT, Val1, Val2);
18732 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
18733 DAG.getNode(AArch64ISD::UADDV, DL, ValVT, AddVal),
18734 DAG.getConstant(0, DL, MVT::i64));
18735}
18736
18737/// Perform the scalar expression combine in the form of:
18738/// CSEL(c, 1, cc) + b => CSINC(b+c, b, cc)
18739/// CSNEG(c, -1, cc) + b => CSINC(b+c, b, cc)
18741 EVT VT = N->getValueType(0);
18742 if (!VT.isScalarInteger() || N->getOpcode() != ISD::ADD)
18743 return SDValue();
18744
18745 SDValue LHS = N->getOperand(0);
18746 SDValue RHS = N->getOperand(1);
18747
18748 // Handle commutivity.
18749 if (LHS.getOpcode() != AArch64ISD::CSEL &&
18750 LHS.getOpcode() != AArch64ISD::CSNEG) {
18751 std::swap(LHS, RHS);
18752 if (LHS.getOpcode() != AArch64ISD::CSEL &&
18753 LHS.getOpcode() != AArch64ISD::CSNEG) {
18754 return SDValue();
18755 }
18756 }
18757
18758 if (!LHS.hasOneUse())
18759 return SDValue();
18760
18761 AArch64CC::CondCode AArch64CC =
18762 static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(2));
18763
18764 // The CSEL should include a const one operand, and the CSNEG should include
18765 // One or NegOne operand.
18766 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(LHS.getOperand(0));
18767 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
18768 if (!CTVal || !CFVal)
18769 return SDValue();
18770
18771 if (!(LHS.getOpcode() == AArch64ISD::CSEL &&
18772 (CTVal->isOne() || CFVal->isOne())) &&
18773 !(LHS.getOpcode() == AArch64ISD::CSNEG &&
18774 (CTVal->isOne() || CFVal->isAllOnes())))
18775 return SDValue();
18776
18777 // Switch CSEL(1, c, cc) to CSEL(c, 1, !cc)
18778 if (LHS.getOpcode() == AArch64ISD::CSEL && CTVal->isOne() &&
18779 !CFVal->isOne()) {
18780 std::swap(CTVal, CFVal);
18781 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
18782 }
18783
18784 SDLoc DL(N);
18785 // Switch CSNEG(1, c, cc) to CSNEG(-c, -1, !cc)
18786 if (LHS.getOpcode() == AArch64ISD::CSNEG && CTVal->isOne() &&
18787 !CFVal->isAllOnes()) {
18788 APInt C = -1 * CFVal->getAPIntValue();
18789 CTVal = cast<ConstantSDNode>(DAG.getConstant(C, DL, VT));
18790 CFVal = cast<ConstantSDNode>(DAG.getAllOnesConstant(DL, VT));
18791 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
18792 }
18793
18794 // It might be neutral for larger constants, as the immediate need to be
18795 // materialized in a register.
18796 APInt ADDC = CTVal->getAPIntValue();
18797 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18798 if (!TLI.isLegalAddImmediate(ADDC.getSExtValue()))
18799 return SDValue();
18800
18801 assert(((LHS.getOpcode() == AArch64ISD::CSEL && CFVal->isOne()) ||
18802 (LHS.getOpcode() == AArch64ISD::CSNEG && CFVal->isAllOnes())) &&
18803 "Unexpected constant value");
18804
18805 SDValue NewNode = DAG.getNode(ISD::ADD, DL, VT, RHS, SDValue(CTVal, 0));
18806 SDValue CCVal = DAG.getConstant(AArch64CC, DL, MVT::i32);
18807 SDValue Cmp = LHS.getOperand(3);
18808
18809 return DAG.getNode(AArch64ISD::CSINC, DL, VT, NewNode, RHS, CCVal, Cmp);
18810}
18811
18812// ADD(UDOT(zero, x, y), A) --> UDOT(A, x, y)
18814 EVT VT = N->getValueType(0);
18815 if (N->getOpcode() != ISD::ADD)
18816 return SDValue();
18817
18818 SDValue Dot = N->getOperand(0);
18819 SDValue A = N->getOperand(1);
18820 // Handle commutivity
18821 auto isZeroDot = [](SDValue Dot) {
18822 return (Dot.getOpcode() == AArch64ISD::UDOT ||
18823 Dot.getOpcode() == AArch64ISD::SDOT) &&
18825 };
18826 if (!isZeroDot(Dot))
18827 std::swap(Dot, A);
18828 if (!isZeroDot(Dot))
18829 return SDValue();
18830
18831 return DAG.getNode(Dot.getOpcode(), SDLoc(N), VT, A, Dot.getOperand(1),
18832 Dot.getOperand(2));
18833}
18834
18836 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0));
18837}
18838
18840 SDLoc DL(Op);
18841 EVT VT = Op.getValueType();
18842 SDValue Zero = DAG.getConstant(0, DL, VT);
18843 return DAG.getNode(ISD::SUB, DL, VT, Zero, Op);
18844}
18845
18846// Try to fold
18847//
18848// (neg (csel X, Y)) -> (csel (neg X), (neg Y))
18849//
18850// The folding helps csel to be matched with csneg without generating
18851// redundant neg instruction, which includes negation of the csel expansion
18852// of abs node lowered by lowerABS.
18854 if (!isNegatedInteger(SDValue(N, 0)))
18855 return SDValue();
18856
18857 SDValue CSel = N->getOperand(1);
18858 if (CSel.getOpcode() != AArch64ISD::CSEL || !CSel->hasOneUse())
18859 return SDValue();
18860
18861 SDValue N0 = CSel.getOperand(0);
18862 SDValue N1 = CSel.getOperand(1);
18863
18864 // If both of them is not negations, it's not worth the folding as it
18865 // introduces two additional negations while reducing one negation.
18866 if (!isNegatedInteger(N0) && !isNegatedInteger(N1))
18867 return SDValue();
18868
18869 SDValue N0N = getNegatedInteger(N0, DAG);
18870 SDValue N1N = getNegatedInteger(N1, DAG);
18871
18872 SDLoc DL(N);
18873 EVT VT = CSel.getValueType();
18874 return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0N, N1N, CSel.getOperand(2),
18875 CSel.getOperand(3));
18876}
18877
18878// The basic add/sub long vector instructions have variants with "2" on the end
18879// which act on the high-half of their inputs. They are normally matched by
18880// patterns like:
18881//
18882// (add (zeroext (extract_high LHS)),
18883// (zeroext (extract_high RHS)))
18884// -> uaddl2 vD, vN, vM
18885//
18886// However, if one of the extracts is something like a duplicate, this
18887// instruction can still be used profitably. This function puts the DAG into a
18888// more appropriate form for those patterns to trigger.
18891 SelectionDAG &DAG = DCI.DAG;
18892 if (DCI.isBeforeLegalizeOps())
18893 return SDValue();
18894
18895 MVT VT = N->getSimpleValueType(0);
18896 if (!VT.is128BitVector()) {
18897 if (N->getOpcode() == ISD::ADD)
18898 return performSetccAddFolding(N, DAG);
18899 return SDValue();
18900 }
18901
18902 // Make sure both branches are extended in the same way.
18903 SDValue LHS = N->getOperand(0);
18904 SDValue RHS = N->getOperand(1);
18905 if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
18906 LHS.getOpcode() != ISD::SIGN_EXTEND) ||
18907 LHS.getOpcode() != RHS.getOpcode())
18908 return SDValue();
18909
18910 unsigned ExtType = LHS.getOpcode();
18911
18912 // It's not worth doing if at least one of the inputs isn't already an
18913 // extract, but we don't know which it'll be so we have to try both.
18914 if (isEssentiallyExtractHighSubvector(LHS.getOperand(0))) {
18915 RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
18916 if (!RHS.getNode())
18917 return SDValue();
18918
18919 RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
18920 } else if (isEssentiallyExtractHighSubvector(RHS.getOperand(0))) {
18921 LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
18922 if (!LHS.getNode())
18923 return SDValue();
18924
18925 LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS);
18926 }
18927
18928 return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
18929}
18930
18931static bool isCMP(SDValue Op) {
18932 return Op.getOpcode() == AArch64ISD::SUBS &&
18933 !Op.getNode()->hasAnyUseOfValue(0);
18934}
18935
18936// (CSEL 1 0 CC Cond) => CC
18937// (CSEL 0 1 CC Cond) => !CC
18938static std::optional<AArch64CC::CondCode> getCSETCondCode(SDValue Op) {
18939 if (Op.getOpcode() != AArch64ISD::CSEL)
18940 return std::nullopt;
18941 auto CC = static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
18942 if (CC == AArch64CC::AL || CC == AArch64CC::NV)
18943 return std::nullopt;
18944 SDValue OpLHS = Op.getOperand(0);
18945 SDValue OpRHS = Op.getOperand(1);
18946 if (isOneConstant(OpLHS) && isNullConstant(OpRHS))
18947 return CC;
18948 if (isNullConstant(OpLHS) && isOneConstant(OpRHS))
18949 return getInvertedCondCode(CC);
18950
18951 return std::nullopt;
18952}
18953
18954// (ADC{S} l r (CMP (CSET HS carry) 1)) => (ADC{S} l r carry)
18955// (SBC{S} l r (CMP 0 (CSET LO carry))) => (SBC{S} l r carry)
18956static SDValue foldOverflowCheck(SDNode *Op, SelectionDAG &DAG, bool IsAdd) {
18957 SDValue CmpOp = Op->getOperand(2);
18958 if (!isCMP(CmpOp))
18959 return SDValue();
18960
18961 if (IsAdd) {
18962 if (!isOneConstant(CmpOp.getOperand(1)))
18963 return SDValue();
18964 } else {
18965 if (!isNullConstant(CmpOp.getOperand(0)))
18966 return SDValue();
18967 }
18968
18969 SDValue CsetOp = CmpOp->getOperand(IsAdd ? 0 : 1);
18970 auto CC = getCSETCondCode(CsetOp);
18971 if (CC != (IsAdd ? AArch64CC::HS : AArch64CC::LO))
18972 return SDValue();
18973
18974 return DAG.getNode(Op->getOpcode(), SDLoc(Op), Op->getVTList(),
18975 Op->getOperand(0), Op->getOperand(1),
18976 CsetOp.getOperand(3));
18977}
18978
18979// (ADC x 0 cond) => (CINC x HS cond)
18981 SDValue LHS = N->getOperand(0);
18982 SDValue RHS = N->getOperand(1);
18983 SDValue Cond = N->getOperand(2);
18984
18985 if (!isNullConstant(RHS))
18986 return SDValue();
18987
18988 EVT VT = N->getValueType(0);
18989 SDLoc DL(N);
18990
18991 // (CINC x cc cond) <=> (CSINC x x !cc cond)
18992 SDValue CC = DAG.getConstant(AArch64CC::LO, DL, MVT::i32);
18993 return DAG.getNode(AArch64ISD::CSINC, DL, VT, LHS, LHS, CC, Cond);
18994}
18995
18996// Transform vector add(zext i8 to i32, zext i8 to i32)
18997// into sext(add(zext(i8 to i16), zext(i8 to i16)) to i32)
18998// This allows extra uses of saddl/uaddl at the lower vector widths, and less
18999// extends.
19001 EVT VT = N->getValueType(0);
19002 if (!VT.isFixedLengthVector() || VT.getSizeInBits() <= 128 ||
19003 (N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
19004 N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND) ||
19005 (N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND &&
19006 N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND) ||
19007 N->getOperand(0).getOperand(0).getValueType() !=
19008 N->getOperand(1).getOperand(0).getValueType())
19009 return SDValue();
19010
19011 SDValue N0 = N->getOperand(0).getOperand(0);
19012 SDValue N1 = N->getOperand(1).getOperand(0);
19013 EVT InVT = N0.getValueType();
19014
19015 EVT S1 = InVT.getScalarType();
19016 EVT S2 = VT.getScalarType();
19017 if ((S2 == MVT::i32 && S1 == MVT::i8) ||
19018 (S2 == MVT::i64 && (S1 == MVT::i8 || S1 == MVT::i16))) {
19019 SDLoc DL(N);
19020 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
19023 SDValue NewN0 = DAG.getNode(N->getOperand(0).getOpcode(), DL, HalfVT, N0);
19024 SDValue NewN1 = DAG.getNode(N->getOperand(1).getOpcode(), DL, HalfVT, N1);
19025 SDValue NewOp = DAG.getNode(N->getOpcode(), DL, HalfVT, NewN0, NewN1);
19026 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, NewOp);
19027 }
19028 return SDValue();
19029}
19030
19033 SelectionDAG &DAG) {
19034 SDLoc DL(N);
19035 EVT VT = N->getValueType(0);
19036
19037 // A build vector of two extracted elements is equivalent to an
19038 // extract subvector where the inner vector is any-extended to the
19039 // extract_vector_elt VT.
19040 // (build_vector (extract_elt_iXX_to_i32 vec Idx+0)
19041 // (extract_elt_iXX_to_i32 vec Idx+1))
19042 // => (extract_subvector (anyext_iXX_to_i32 vec) Idx)
19043
19044 // For now, only consider the v2i32 case, which arises as a result of
19045 // legalization.
19046 if (VT != MVT::v2i32)
19047 return SDValue();
19048
19049 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1);
19050 // Reminder, EXTRACT_VECTOR_ELT has the effect of any-extending to its VT.
19051 if (Elt0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19052 Elt1->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19053 // Constant index.
19054 isa<ConstantSDNode>(Elt0->getOperand(1)) &&
19055 isa<ConstantSDNode>(Elt1->getOperand(1)) &&
19056 // Both EXTRACT_VECTOR_ELT from same vector...
19057 Elt0->getOperand(0) == Elt1->getOperand(0) &&
19058 // ... and contiguous. First element's index +1 == second element's index.
19059 Elt0->getConstantOperandVal(1) + 1 == Elt1->getConstantOperandVal(1) &&
19060 // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
19061 // ResultType's known minimum vector length.
19062 Elt0->getConstantOperandVal(1) % VT.getVectorMinNumElements() == 0) {
19063 SDValue VecToExtend = Elt0->getOperand(0);
19064 EVT ExtVT = VecToExtend.getValueType().changeVectorElementType(MVT::i32);
19065 if (!DAG.getTargetLoweringInfo().isTypeLegal(ExtVT))
19066 return SDValue();
19067
19068 SDValue SubvectorIdx = DAG.getVectorIdxConstant(Elt0->getConstantOperandVal(1), DL);
19069
19070 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, DL, ExtVT, VecToExtend);
19071 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Ext,
19072 SubvectorIdx);
19073 }
19074
19075 return SDValue();
19076}
19077
19079 SelectionDAG &DAG) {
19080 EVT VT = N->getValueType(0);
19081 SDValue N0 = N->getOperand(0);
19082 if (VT.isFixedLengthVector() && VT.is64BitVector() && N0.hasOneUse() &&
19083 N0.getOpcode() == AArch64ISD::DUP) {
19084 SDValue Op = N0.getOperand(0);
19085 if (VT.getScalarType() == MVT::i32 &&
19086 N0.getOperand(0).getValueType().getScalarType() == MVT::i64)
19087 Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N), MVT::i32, Op);
19088 return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, Op);
19089 }
19090
19091 return SDValue();
19092}
19093
19094// Check an node is an extend or shift operand
19096 unsigned Opcode = N.getOpcode();
19097 if (ISD::isExtOpcode(Opcode) || Opcode == ISD::SIGN_EXTEND_INREG) {
19098 EVT SrcVT;
19099 if (Opcode == ISD::SIGN_EXTEND_INREG)
19100 SrcVT = cast<VTSDNode>(N.getOperand(1))->getVT();
19101 else
19102 SrcVT = N.getOperand(0).getValueType();
19103
19104 return SrcVT == MVT::i32 || SrcVT == MVT::i16 || SrcVT == MVT::i8;
19105 } else if (Opcode == ISD::AND) {
19106 ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
19107 if (!CSD)
19108 return false;
19109 uint64_t AndMask = CSD->getZExtValue();
19110 return AndMask == 0xff || AndMask == 0xffff || AndMask == 0xffffffff;
19111 } else if (Opcode == ISD::SHL || Opcode == ISD::SRL || Opcode == ISD::SRA) {
19112 return isa<ConstantSDNode>(N.getOperand(1));
19113 }
19114
19115 return false;
19116}
19117
19118// (N - Y) + Z --> (Z - Y) + N
19119// when N is an extend or shift operand
19121 SelectionDAG &DAG) {
19122 auto IsOneUseExtend = [](SDValue N) {
19123 return N.hasOneUse() && isExtendOrShiftOperand(N);
19124 };
19125
19126 // DAGCombiner will revert the combination when Z is constant cause
19127 // dead loop. So don't enable the combination when Z is constant.
19128 // If Z is one use shift C, we also can't do the optimization.
19129 // It will falling to self infinite loop.
19130 if (isa<ConstantSDNode>(Z) || IsOneUseExtend(Z))
19131 return SDValue();
19132
19133 if (SUB.getOpcode() != ISD::SUB || !SUB.hasOneUse())
19134 return SDValue();
19135
19136 SDValue Shift = SUB.getOperand(0);
19137 if (!IsOneUseExtend(Shift))
19138 return SDValue();
19139
19140 SDLoc DL(N);
19141 EVT VT = N->getValueType(0);
19142
19143 SDValue Y = SUB.getOperand(1);
19144 SDValue NewSub = DAG.getNode(ISD::SUB, DL, VT, Z, Y);
19145 return DAG.getNode(ISD::ADD, DL, VT, NewSub, Shift);
19146}
19147
19149 SelectionDAG &DAG) {
19150 // NOTE: Swapping LHS and RHS is not done for SUB, since SUB is not
19151 // commutative.
19152 if (N->getOpcode() != ISD::ADD)
19153 return SDValue();
19154
19155 // Bail out when value type is not one of {i32, i64}, since AArch64 ADD with
19156 // shifted register is only available for i32 and i64.
19157 EVT VT = N->getValueType(0);
19158 if (VT != MVT::i32 && VT != MVT::i64)
19159 return SDValue();
19160
19161 SDLoc DL(N);
19162 SDValue LHS = N->getOperand(0);
19163 SDValue RHS = N->getOperand(1);
19164
19165 if (SDValue Val = performAddCombineSubShift(N, LHS, RHS, DAG))
19166 return Val;
19167 if (SDValue Val = performAddCombineSubShift(N, RHS, LHS, DAG))
19168 return Val;
19169
19170 uint64_t LHSImm = 0, RHSImm = 0;
19171 // If both operand are shifted by imm and shift amount is not greater than 4
19172 // for one operand, swap LHS and RHS to put operand with smaller shift amount
19173 // on RHS.
19174 //
19175 // On many AArch64 processors (Cortex A78, Neoverse N1/N2/V1, etc), ADD with
19176 // LSL shift (shift <= 4) has smaller latency and larger throughput than ADD
19177 // with LSL (shift > 4). For the rest of processors, this is no-op for
19178 // performance or correctness.
19179 if (isOpcWithIntImmediate(LHS.getNode(), ISD::SHL, LHSImm) &&
19180 isOpcWithIntImmediate(RHS.getNode(), ISD::SHL, RHSImm) && LHSImm <= 4 &&
19181 RHSImm > 4 && LHS.hasOneUse())
19182 return DAG.getNode(ISD::ADD, DL, VT, RHS, LHS);
19183
19184 return SDValue();
19185}
19186
19187// The mid end will reassociate sub(sub(x, m1), m2) to sub(x, add(m1, m2))
19188// This reassociates it back to allow the creation of more mls instructions.
19190 if (N->getOpcode() != ISD::SUB)
19191 return SDValue();
19192
19193 SDValue Add = N->getOperand(1);
19194 SDValue X = N->getOperand(0);
19195 if (Add.getOpcode() != ISD::ADD)
19196 return SDValue();
19197
19198 if (!Add.hasOneUse())
19199 return SDValue();
19201 return SDValue();
19202
19203 SDValue M1 = Add.getOperand(0);
19204 SDValue M2 = Add.getOperand(1);
19205 if (M1.getOpcode() != ISD::MUL && M1.getOpcode() != AArch64ISD::SMULL &&
19206 M1.getOpcode() != AArch64ISD::UMULL)
19207 return SDValue();
19208 if (M2.getOpcode() != ISD::MUL && M2.getOpcode() != AArch64ISD::SMULL &&
19210 return SDValue();
19211
19212 EVT VT = N->getValueType(0);
19213 SDValue Sub = DAG.getNode(ISD::SUB, SDLoc(N), VT, X, M1);
19214 return DAG.getNode(ISD::SUB, SDLoc(N), VT, Sub, M2);
19215}
19216
19217// Combine into mla/mls.
19218// This works on the patterns of:
19219// add v1, (mul v2, v3)
19220// sub v1, (mul v2, v3)
19221// for vectors of type <1 x i64> and <2 x i64> when SVE is available.
19222// It will transform the add/sub to a scalable version, so that we can
19223// make use of SVE's MLA/MLS that will be generated for that pattern
19224static SDValue
19226 SelectionDAG &DAG = DCI.DAG;
19227 // Make sure that the types are legal
19228 if (!DCI.isAfterLegalizeDAG())
19229 return SDValue();
19230 // Before using SVE's features, check first if it's available.
19231 if (!DAG.getSubtarget<AArch64Subtarget>().hasSVE())
19232 return SDValue();
19233
19234 if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::SUB)
19235 return SDValue();
19236
19237 if (!N->getValueType(0).isFixedLengthVector())
19238 return SDValue();
19239
19240 auto performOpt = [&DAG, &N](SDValue Op0, SDValue Op1) -> SDValue {
19241 if (Op1.getOpcode() != ISD::EXTRACT_SUBVECTOR)
19242 return SDValue();
19243
19244 if (!cast<ConstantSDNode>(Op1->getOperand(1))->isZero())
19245 return SDValue();
19246
19247 SDValue MulValue = Op1->getOperand(0);
19248 if (MulValue.getOpcode() != AArch64ISD::MUL_PRED)
19249 return SDValue();
19250
19251 if (!Op1.hasOneUse() || !MulValue.hasOneUse())
19252 return SDValue();
19253
19254 EVT ScalableVT = MulValue.getValueType();
19255 if (!ScalableVT.isScalableVector())
19256 return SDValue();
19257
19258 SDValue ScaledOp = convertToScalableVector(DAG, ScalableVT, Op0);
19259 SDValue NewValue =
19260 DAG.getNode(N->getOpcode(), SDLoc(N), ScalableVT, {ScaledOp, MulValue});
19261 return convertFromScalableVector(DAG, N->getValueType(0), NewValue);
19262 };
19263
19264 if (SDValue res = performOpt(N->getOperand(0), N->getOperand(1)))
19265 return res;
19266 else if (N->getOpcode() == ISD::ADD)
19267 return performOpt(N->getOperand(1), N->getOperand(0));
19268
19269 return SDValue();
19270}
19271
19272// Given a i64 add from a v1i64 extract, convert to a neon v1i64 add. This can
19273// help, for example, to produce ssra from sshr+add.
19275 EVT VT = N->getValueType(0);
19276 if (VT != MVT::i64)
19277 return SDValue();
19278 SDValue Op0 = N->getOperand(0);
19279 SDValue Op1 = N->getOperand(1);
19280
19281 // At least one of the operands should be an extract, and the other should be
19282 // something that is easy to convert to v1i64 type (in this case a load).
19283 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
19284 Op0.getOpcode() != ISD::LOAD)
19285 return SDValue();
19286 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
19287 Op1.getOpcode() != ISD::LOAD)
19288 return SDValue();
19289
19290 SDLoc DL(N);
19291 if (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19292 Op0.getOperand(0).getValueType() == MVT::v1i64) {
19293 Op0 = Op0.getOperand(0);
19294 Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op1);
19295 } else if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19296 Op1.getOperand(0).getValueType() == MVT::v1i64) {
19297 Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op0);
19298 Op1 = Op1.getOperand(0);
19299 } else
19300 return SDValue();
19301
19302 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64,
19303 DAG.getNode(N->getOpcode(), DL, MVT::v1i64, Op0, Op1),
19304 DAG.getConstant(0, DL, MVT::i64));
19305}
19306
19309 if (!BV->hasOneUse())
19310 return false;
19311 if (auto *Ld = dyn_cast<LoadSDNode>(BV)) {
19312 if (!Ld || !Ld->isSimple())
19313 return false;
19314 Loads.push_back(Ld);
19315 return true;
19316 } else if (BV.getOpcode() == ISD::BUILD_VECTOR ||
19318 for (unsigned Op = 0; Op < BV.getNumOperands(); Op++) {
19319 auto *Ld = dyn_cast<LoadSDNode>(BV.getOperand(Op));
19320 if (!Ld || !Ld->isSimple() || !BV.getOperand(Op).hasOneUse())
19321 return false;
19322 Loads.push_back(Ld);
19323 }
19324 return true;
19325 } else if (B.getOpcode() == ISD::VECTOR_SHUFFLE) {
19326 // Try to find a tree of shuffles and concats from how IR shuffles of loads
19327 // are lowered. Note that this only comes up because we do not always visit
19328 // operands before uses. After that is fixed this can be removed and in the
19329 // meantime this is fairly specific to the lowering we expect from IR.
19330 // t46: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19> t44, t45
19331 // t44: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u> t42, t43
19332 // t42: v16i8 = concat_vectors t40, t36, undef:v4i8, undef:v4i8
19333 // t40: v4i8,ch = load<(load (s32) from %ir.17)> t0, t22, undef:i64
19334 // t36: v4i8,ch = load<(load (s32) from %ir.13)> t0, t18, undef:i64
19335 // t43: v16i8 = concat_vectors t32, undef:v4i8, undef:v4i8, undef:v4i8
19336 // t32: v4i8,ch = load<(load (s32) from %ir.9)> t0, t14, undef:i64
19337 // t45: v16i8 = concat_vectors t28, undef:v4i8, undef:v4i8, undef:v4i8
19338 // t28: v4i8,ch = load<(load (s32) from %ir.0)> t0, t2, undef:i64
19339 if (B.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE ||
19340 B.getOperand(0).getOperand(0).getOpcode() != ISD::CONCAT_VECTORS ||
19341 B.getOperand(0).getOperand(1).getOpcode() != ISD::CONCAT_VECTORS ||
19342 B.getOperand(1).getOpcode() != ISD::CONCAT_VECTORS ||
19343 B.getOperand(1).getNumOperands() != 4)
19344 return false;
19345 auto SV1 = cast<ShuffleVectorSDNode>(B);
19346 auto SV2 = cast<ShuffleVectorSDNode>(B.getOperand(0));
19347 int NumElts = B.getValueType().getVectorNumElements();
19348 int NumSubElts = NumElts / 4;
19349 for (int I = 0; I < NumSubElts; I++) {
19350 // <0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19>
19351 if (SV1->getMaskElt(I) != I ||
19352 SV1->getMaskElt(I + NumSubElts) != I + NumSubElts ||
19353 SV1->getMaskElt(I + NumSubElts * 2) != I + NumSubElts * 2 ||
19354 SV1->getMaskElt(I + NumSubElts * 3) != I + NumElts)
19355 return false;
19356 // <0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u>
19357 if (SV2->getMaskElt(I) != I ||
19358 SV2->getMaskElt(I + NumSubElts) != I + NumSubElts ||
19359 SV2->getMaskElt(I + NumSubElts * 2) != I + NumElts)
19360 return false;
19361 }
19362 auto *Ld0 = dyn_cast<LoadSDNode>(SV2->getOperand(0).getOperand(0));
19363 auto *Ld1 = dyn_cast<LoadSDNode>(SV2->getOperand(0).getOperand(1));
19364 auto *Ld2 = dyn_cast<LoadSDNode>(SV2->getOperand(1).getOperand(0));
19365 auto *Ld3 = dyn_cast<LoadSDNode>(B.getOperand(1).getOperand(0));
19366 if (!Ld0 || !Ld1 || !Ld2 || !Ld3 || !Ld0->isSimple() || !Ld1->isSimple() ||
19367 !Ld2->isSimple() || !Ld3->isSimple())
19368 return false;
19369 Loads.push_back(Ld0);
19370 Loads.push_back(Ld1);
19371 Loads.push_back(Ld2);
19372 Loads.push_back(Ld3);
19373 return true;
19374 }
19375 return false;
19376}
19377
19379 SelectionDAG &DAG,
19380 unsigned &NumSubLoads) {
19381 if (!Op0.hasOneUse() || !Op1.hasOneUse())
19382 return false;
19383
19384 SmallVector<LoadSDNode *> Loads0, Loads1;
19385 if (isLoadOrMultipleLoads(Op0, Loads0) &&
19386 isLoadOrMultipleLoads(Op1, Loads1)) {
19387 if (NumSubLoads && Loads0.size() != NumSubLoads)
19388 return false;
19389 NumSubLoads = Loads0.size();
19390 return Loads0.size() == Loads1.size() &&
19391 all_of(zip(Loads0, Loads1), [&DAG](auto L) {
19392 unsigned Size = get<0>(L)->getValueType(0).getSizeInBits();
19393 return Size == get<1>(L)->getValueType(0).getSizeInBits() &&
19394 DAG.areNonVolatileConsecutiveLoads(get<1>(L), get<0>(L),
19395 Size / 8, 1);
19396 });
19397 }
19398
19399 if (Op0.getOpcode() != Op1.getOpcode())
19400 return false;
19401
19402 switch (Op0.getOpcode()) {
19403 case ISD::ADD:
19404 case ISD::SUB:
19406 DAG, NumSubLoads) &&
19408 DAG, NumSubLoads);
19409 case ISD::SIGN_EXTEND:
19410 case ISD::ANY_EXTEND:
19411 case ISD::ZERO_EXTEND:
19412 EVT XVT = Op0.getOperand(0).getValueType();
19413 if (XVT.getScalarSizeInBits() != 8 && XVT.getScalarSizeInBits() != 16 &&
19414 XVT.getScalarSizeInBits() != 32)
19415 return false;
19417 DAG, NumSubLoads);
19418 }
19419 return false;
19420}
19421
19422// This method attempts to fold trees of add(ext(load p), shl(ext(load p+4))
19423// into a single load of twice the size, that we extract the bottom part and top
19424// part so that the shl can use a shll2 instruction. The two loads in that
19425// example can also be larger trees of instructions, which are identical except
19426// for the leaves which are all loads offset from the LHS, including
19427// buildvectors of multiple loads. For example the RHS tree could be
19428// sub(zext(buildvec(load p+4, load q+4)), zext(buildvec(load r+4, load s+4)))
19429// Whilst it can be common for the larger loads to replace LDP instructions
19430// (which doesn't gain anything on it's own), the larger loads can help create
19431// more efficient code, and in buildvectors prevent the need for ld1 lane
19432// inserts which can be slower than normal loads.
19434 EVT VT = N->getValueType(0);
19435 if (!VT.isFixedLengthVector() ||
19436 (VT.getScalarSizeInBits() != 16 && VT.getScalarSizeInBits() != 32 &&
19437 VT.getScalarSizeInBits() != 64))
19438 return SDValue();
19439
19440 SDValue Other = N->getOperand(0);
19441 SDValue Shift = N->getOperand(1);
19442 if (Shift.getOpcode() != ISD::SHL && N->getOpcode() != ISD::SUB)
19443 std::swap(Shift, Other);
19444 APInt ShiftAmt;
19445 if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse() ||
19446 !ISD::isConstantSplatVector(Shift.getOperand(1).getNode(), ShiftAmt))
19447 return SDValue();
19448
19449 if (!ISD::isExtOpcode(Shift.getOperand(0).getOpcode()) ||
19450 !ISD::isExtOpcode(Other.getOpcode()) ||
19451 Shift.getOperand(0).getOperand(0).getValueType() !=
19452 Other.getOperand(0).getValueType() ||
19453 !Other.hasOneUse() || !Shift.getOperand(0).hasOneUse())
19454 return SDValue();
19455
19456 SDValue Op0 = Other.getOperand(0);
19457 SDValue Op1 = Shift.getOperand(0).getOperand(0);
19458
19459 unsigned NumSubLoads = 0;
19460 if (!areLoadedOffsetButOtherwiseSame(Op0, Op1, DAG, NumSubLoads))
19461 return SDValue();
19462
19463 // Attempt to rule out some unprofitable cases using heuristics (some working
19464 // around suboptimal code generation), notably if the extend not be able to
19465 // use ushll2 instructions as the types are not large enough. Otherwise zip's
19466 // will need to be created which can increase the instruction count.
19467 unsigned NumElts = Op0.getValueType().getVectorNumElements();
19468 unsigned NumSubElts = NumElts / NumSubLoads;
19469 if (NumSubElts * VT.getScalarSizeInBits() < 128 ||
19470 (Other.getOpcode() != Shift.getOperand(0).getOpcode() &&
19471 Op0.getValueType().getSizeInBits() < 128 &&
19473 return SDValue();
19474
19475 // Recreate the tree with the new combined loads.
19476 std::function<SDValue(SDValue, SDValue, SelectionDAG &)> GenCombinedTree =
19477 [&GenCombinedTree](SDValue Op0, SDValue Op1, SelectionDAG &DAG) {
19478 EVT DVT =
19480
19481 SmallVector<LoadSDNode *> Loads0, Loads1;
19482 if (isLoadOrMultipleLoads(Op0, Loads0) &&
19483 isLoadOrMultipleLoads(Op1, Loads1)) {
19484 EVT LoadVT = EVT::getVectorVT(
19485 *DAG.getContext(), Op0.getValueType().getScalarType(),
19486 Op0.getValueType().getVectorNumElements() / Loads0.size());
19487 EVT DLoadVT = LoadVT.getDoubleNumVectorElementsVT(*DAG.getContext());
19488
19489 SmallVector<SDValue> NewLoads;
19490 for (const auto &[L0, L1] : zip(Loads0, Loads1)) {
19491 SDValue Load = DAG.getLoad(DLoadVT, SDLoc(L0), L0->getChain(),
19492 L0->getBasePtr(), L0->getPointerInfo(),
19493 L0->getOriginalAlign());
19494 DAG.makeEquivalentMemoryOrdering(L0, Load.getValue(1));
19495 DAG.makeEquivalentMemoryOrdering(L1, Load.getValue(1));
19496 NewLoads.push_back(Load);
19497 }
19498 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op0), DVT, NewLoads);
19499 }
19500
19502 for (const auto &[O0, O1] : zip(Op0->op_values(), Op1->op_values()))
19503 Ops.push_back(GenCombinedTree(O0, O1, DAG));
19504 return DAG.getNode(Op0.getOpcode(), SDLoc(Op0), DVT, Ops);
19505 };
19506 SDValue NewOp = GenCombinedTree(Op0, Op1, DAG);
19507
19508 SmallVector<int> LowMask(NumElts, 0), HighMask(NumElts, 0);
19509 int Hi = NumSubElts, Lo = 0;
19510 for (unsigned i = 0; i < NumSubLoads; i++) {
19511 for (unsigned j = 0; j < NumSubElts; j++) {
19512 LowMask[i * NumSubElts + j] = Lo++;
19513 HighMask[i * NumSubElts + j] = Hi++;
19514 }
19515 Lo += NumSubElts;
19516 Hi += NumSubElts;
19517 }
19518 SDLoc DL(N);
19519 SDValue Ext0, Ext1;
19520 // Extract the top and bottom lanes, then extend the result. Possibly extend
19521 // the result then extract the lanes if the two operands match as it produces
19522 // slightly smaller code.
19523 if (Other.getOpcode() != Shift.getOperand(0).getOpcode()) {
19525 NewOp, DAG.getConstant(0, DL, MVT::i64));
19526 SDValue SubH =
19527 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Op0.getValueType(), NewOp,
19528 DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64));
19529 SDValue Extr0 =
19530 DAG.getVectorShuffle(Op0.getValueType(), DL, SubL, SubH, LowMask);
19531 SDValue Extr1 =
19532 DAG.getVectorShuffle(Op0.getValueType(), DL, SubL, SubH, HighMask);
19533 Ext0 = DAG.getNode(Other.getOpcode(), DL, VT, Extr0);
19534 Ext1 = DAG.getNode(Shift.getOperand(0).getOpcode(), DL, VT, Extr1);
19535 } else {
19537 SDValue Ext = DAG.getNode(Other.getOpcode(), DL, DVT, NewOp);
19538 SDValue SubL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext,
19539 DAG.getConstant(0, DL, MVT::i64));
19540 SDValue SubH =
19541 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext,
19542 DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64));
19543 Ext0 = DAG.getVectorShuffle(VT, DL, SubL, SubH, LowMask);
19544 Ext1 = DAG.getVectorShuffle(VT, DL, SubL, SubH, HighMask);
19545 }
19546 SDValue NShift =
19547 DAG.getNode(Shift.getOpcode(), DL, VT, Ext1, Shift.getOperand(1));
19548 return DAG.getNode(N->getOpcode(), DL, VT, Ext0, NShift);
19549}
19550
19553 // Try to change sum of two reductions.
19554 if (SDValue Val = performAddUADDVCombine(N, DCI.DAG))
19555 return Val;
19556 if (SDValue Val = performAddDotCombine(N, DCI.DAG))
19557 return Val;
19558 if (SDValue Val = performAddCSelIntoCSinc(N, DCI.DAG))
19559 return Val;
19560 if (SDValue Val = performNegCSelCombine(N, DCI.DAG))
19561 return Val;
19563 return Val;
19565 return Val;
19566 if (SDValue Val = performSubAddMULCombine(N, DCI.DAG))
19567 return Val;
19568 if (SDValue Val = performSVEMulAddSubCombine(N, DCI))
19569 return Val;
19570 if (SDValue Val = performAddSubIntoVectorOp(N, DCI.DAG))
19571 return Val;
19572
19573 if (SDValue Val = performExtBinopLoadFold(N, DCI.DAG))
19574 return Val;
19575
19576 return performAddSubLongCombine(N, DCI);
19577}
19578
19579// Massage DAGs which we can use the high-half "long" operations on into
19580// something isel will recognize better. E.g.
19581//
19582// (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
19583// (aarch64_neon_umull (extract_high (v2i64 vec)))
19584// (extract_high (v2i64 (dup128 scalar)))))
19585//
19588 SelectionDAG &DAG) {
19589 if (DCI.isBeforeLegalizeOps())
19590 return SDValue();
19591
19592 SDValue LHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 0 : 1);
19593 SDValue RHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 1 : 2);
19594 assert(LHS.getValueType().is64BitVector() &&
19595 RHS.getValueType().is64BitVector() &&
19596 "unexpected shape for long operation");
19597
19598 // Either node could be a DUP, but it's not worth doing both of them (you'd
19599 // just as well use the non-high version) so look for a corresponding extract
19600 // operation on the other "wing".
19603 if (!RHS.getNode())
19604 return SDValue();
19607 if (!LHS.getNode())
19608 return SDValue();
19609 } else
19610 return SDValue();
19611
19612 if (IID == Intrinsic::not_intrinsic)
19613 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), LHS, RHS);
19614
19615 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
19616 N->getOperand(0), LHS, RHS);
19617}
19618
19619static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
19620 MVT ElemTy = N->getSimpleValueType(0).getScalarType();
19621 unsigned ElemBits = ElemTy.getSizeInBits();
19622
19623 int64_t ShiftAmount;
19624 if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) {
19625 APInt SplatValue, SplatUndef;
19626 unsigned SplatBitSize;
19627 bool HasAnyUndefs;
19628 if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
19629 HasAnyUndefs, ElemBits) ||
19630 SplatBitSize != ElemBits)
19631 return SDValue();
19632
19633 ShiftAmount = SplatValue.getSExtValue();
19634 } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
19635 ShiftAmount = CVN->getSExtValue();
19636 } else
19637 return SDValue();
19638
19639 // If the shift amount is zero, remove the shift intrinsic.
19640 if (ShiftAmount == 0 && IID != Intrinsic::aarch64_neon_sqshlu)
19641 return N->getOperand(1);
19642
19643 unsigned Opcode;
19644 bool IsRightShift;
19645 switch (IID) {
19646 default:
19647 llvm_unreachable("Unknown shift intrinsic");
19648 case Intrinsic::aarch64_neon_sqshl:
19649 Opcode = AArch64ISD::SQSHL_I;
19650 IsRightShift = false;
19651 break;
19652 case Intrinsic::aarch64_neon_uqshl:
19653 Opcode = AArch64ISD::UQSHL_I;
19654 IsRightShift = false;
19655 break;
19656 case Intrinsic::aarch64_neon_srshl:
19657 Opcode = AArch64ISD::SRSHR_I;
19658 IsRightShift = true;
19659 break;
19660 case Intrinsic::aarch64_neon_urshl:
19661 Opcode = AArch64ISD::URSHR_I;
19662 IsRightShift = true;
19663 break;
19664 case Intrinsic::aarch64_neon_sqshlu:
19665 Opcode = AArch64ISD::SQSHLU_I;
19666 IsRightShift = false;
19667 break;
19668 case Intrinsic::aarch64_neon_sshl:
19669 case Intrinsic::aarch64_neon_ushl:
19670 // For positive shift amounts we can use SHL, as ushl/sshl perform a regular
19671 // left shift for positive shift amounts. For negative shifts we can use a
19672 // VASHR/VLSHR as appropiate.
19673 if (ShiftAmount < 0) {
19674 Opcode = IID == Intrinsic::aarch64_neon_sshl ? AArch64ISD::VASHR
19676 ShiftAmount = -ShiftAmount;
19677 } else
19678 Opcode = AArch64ISD::VSHL;
19679 IsRightShift = false;
19680 break;
19681 }
19682
19683 EVT VT = N->getValueType(0);
19684 SDValue Op = N->getOperand(1);
19685 SDLoc dl(N);
19686 if (VT == MVT::i64) {
19687 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op);
19688 VT = MVT::v1i64;
19689 }
19690
19691 if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) {
19692 Op = DAG.getNode(Opcode, dl, VT, Op,
19693 DAG.getConstant(-ShiftAmount, dl, MVT::i32));
19694 if (N->getValueType(0) == MVT::i64)
19695 Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Op,
19696 DAG.getConstant(0, dl, MVT::i64));
19697 return Op;
19698 } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) {
19699 Op = DAG.getNode(Opcode, dl, VT, Op,
19700 DAG.getConstant(ShiftAmount, dl, MVT::i32));
19701 if (N->getValueType(0) == MVT::i64)
19702 Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Op,
19703 DAG.getConstant(0, dl, MVT::i64));
19704 return Op;
19705 }
19706
19707 return SDValue();
19708}
19709
19710// The CRC32[BH] instructions ignore the high bits of their data operand. Since
19711// the intrinsics must be legal and take an i32, this means there's almost
19712// certainly going to be a zext in the DAG which we can eliminate.
19713static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
19714 SDValue AndN = N->getOperand(2);
19715 if (AndN.getOpcode() != ISD::AND)
19716 return SDValue();
19717
19718 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(AndN.getOperand(1));
19719 if (!CMask || CMask->getZExtValue() != Mask)
19720 return SDValue();
19721
19722 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32,
19723 N->getOperand(0), N->getOperand(1), AndN.getOperand(0));
19724}
19725
19727 SelectionDAG &DAG) {
19728 SDLoc dl(N);
19729 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0),
19730 DAG.getNode(Opc, dl,
19731 N->getOperand(1).getSimpleValueType(),
19732 N->getOperand(1)),
19733 DAG.getConstant(0, dl, MVT::i64));
19734}
19735
19737 SDLoc DL(N);
19738 SDValue Op1 = N->getOperand(1);
19739 SDValue Op2 = N->getOperand(2);
19740 EVT ScalarTy = Op2.getValueType();
19741 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
19742 ScalarTy = MVT::i32;
19743
19744 // Lower index_vector(base, step) to mul(step step_vector(1)) + splat(base).
19745 SDValue StepVector = DAG.getStepVector(DL, N->getValueType(0));
19746 SDValue Step = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op2);
19747 SDValue Mul = DAG.getNode(ISD::MUL, DL, N->getValueType(0), StepVector, Step);
19748 SDValue Base = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op1);
19749 return DAG.getNode(ISD::ADD, DL, N->getValueType(0), Mul, Base);
19750}
19751
19753 SDLoc dl(N);
19754 SDValue Scalar = N->getOperand(3);
19755 EVT ScalarTy = Scalar.getValueType();
19756
19757 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
19758 Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
19759
19760 SDValue Passthru = N->getOperand(1);
19761 SDValue Pred = N->getOperand(2);
19762 return DAG.getNode(AArch64ISD::DUP_MERGE_PASSTHRU, dl, N->getValueType(0),
19763 Pred, Scalar, Passthru);
19764}
19765
19767 SDLoc dl(N);
19768 LLVMContext &Ctx = *DAG.getContext();
19769 EVT VT = N->getValueType(0);
19770
19771 assert(VT.isScalableVector() && "Expected a scalable vector.");
19772
19773 // Current lowering only supports the SVE-ACLE types.
19775 return SDValue();
19776
19777 unsigned ElemSize = VT.getVectorElementType().getSizeInBits() / 8;
19778 unsigned ByteSize = VT.getSizeInBits().getKnownMinValue() / 8;
19779 EVT ByteVT =
19780 EVT::getVectorVT(Ctx, MVT::i8, ElementCount::getScalable(ByteSize));
19781
19782 // Convert everything to the domain of EXT (i.e bytes).
19783 SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(1));
19784 SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(2));
19785 SDValue Op2 = DAG.getNode(ISD::MUL, dl, MVT::i32, N->getOperand(3),
19786 DAG.getConstant(ElemSize, dl, MVT::i32));
19787
19788 SDValue EXT = DAG.getNode(AArch64ISD::EXT, dl, ByteVT, Op0, Op1, Op2);
19789 return DAG.getNode(ISD::BITCAST, dl, VT, EXT);
19790}
19791
19794 SelectionDAG &DAG) {
19795 if (DCI.isBeforeLegalize())
19796 return SDValue();
19797
19798 SDValue Comparator = N->getOperand(3);
19799 if (Comparator.getOpcode() == AArch64ISD::DUP ||
19800 Comparator.getOpcode() == ISD::SPLAT_VECTOR) {
19801 unsigned IID = getIntrinsicID(N);
19802 EVT VT = N->getValueType(0);
19803 EVT CmpVT = N->getOperand(2).getValueType();
19804 SDValue Pred = N->getOperand(1);
19805 SDValue Imm;
19806 SDLoc DL(N);
19807
19808 switch (IID) {
19809 default:
19810 llvm_unreachable("Called with wrong intrinsic!");
19811 break;
19812
19813 // Signed comparisons
19814 case Intrinsic::aarch64_sve_cmpeq_wide:
19815 case Intrinsic::aarch64_sve_cmpne_wide:
19816 case Intrinsic::aarch64_sve_cmpge_wide:
19817 case Intrinsic::aarch64_sve_cmpgt_wide:
19818 case Intrinsic::aarch64_sve_cmplt_wide:
19819 case Intrinsic::aarch64_sve_cmple_wide: {
19820 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
19821 int64_t ImmVal = CN->getSExtValue();
19822 if (ImmVal >= -16 && ImmVal <= 15)
19823 Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
19824 else
19825 return SDValue();
19826 }
19827 break;
19828 }
19829 // Unsigned comparisons
19830 case Intrinsic::aarch64_sve_cmphs_wide:
19831 case Intrinsic::aarch64_sve_cmphi_wide:
19832 case Intrinsic::aarch64_sve_cmplo_wide:
19833 case Intrinsic::aarch64_sve_cmpls_wide: {
19834 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
19835 uint64_t ImmVal = CN->getZExtValue();
19836 if (ImmVal <= 127)
19837 Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
19838 else
19839 return SDValue();
19840 }
19841 break;
19842 }
19843 }
19844
19845 if (!Imm)
19846 return SDValue();
19847
19848 SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, DL, CmpVT, Imm);
19849 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, VT, Pred,
19850 N->getOperand(2), Splat, DAG.getCondCode(CC));
19851 }
19852
19853 return SDValue();
19854}
19855
19858 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19859
19860 SDLoc DL(Op);
19861 assert(Op.getValueType().isScalableVector() &&
19862 TLI.isTypeLegal(Op.getValueType()) &&
19863 "Expected legal scalable vector type!");
19864 assert(Op.getValueType() == Pg.getValueType() &&
19865 "Expected same type for PTEST operands");
19866
19867 // Ensure target specific opcodes are using legal type.
19868 EVT OutVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
19869 SDValue TVal = DAG.getConstant(1, DL, OutVT);
19870 SDValue FVal = DAG.getConstant(0, DL, OutVT);
19871
19872 // Ensure operands have type nxv16i1.
19873 if (Op.getValueType() != MVT::nxv16i1) {
19876 Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Pg);
19877 else
19878 Pg = getSVEPredicateBitCast(MVT::nxv16i1, Pg, DAG);
19879 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Op);
19880 }
19881
19882 // Set condition code (CC) flags.
19883 SDValue Test = DAG.getNode(
19885 DL, MVT::Other, Pg, Op);
19886
19887 // Convert CC to integer based on requested condition.
19888 // NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare.
19889 SDValue CC = DAG.getConstant(getInvertedCondCode(Cond), DL, MVT::i32);
19890 SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, OutVT, FVal, TVal, CC, Test);
19891 return DAG.getZExtOrTrunc(Res, DL, VT);
19892}
19893
19895 SelectionDAG &DAG) {
19896 SDLoc DL(N);
19897
19898 SDValue Pred = N->getOperand(1);
19899 SDValue VecToReduce = N->getOperand(2);
19900
19901 // NOTE: The integer reduction's result type is not always linked to the
19902 // operand's element type so we construct it from the intrinsic's result type.
19903 EVT ReduceVT = getPackedSVEVectorVT(N->getValueType(0));
19904 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
19905
19906 // SVE reductions set the whole vector register with the first element
19907 // containing the reduction result, which we'll now extract.
19908 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
19909 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
19910 Zero);
19911}
19912
19914 SelectionDAG &DAG) {
19915 SDLoc DL(N);
19916
19917 SDValue Pred = N->getOperand(1);
19918 SDValue VecToReduce = N->getOperand(2);
19919
19920 EVT ReduceVT = VecToReduce.getValueType();
19921 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
19922
19923 // SVE reductions set the whole vector register with the first element
19924 // containing the reduction result, which we'll now extract.
19925 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
19926 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
19927 Zero);
19928}
19929
19931 SelectionDAG &DAG) {
19932 SDLoc DL(N);
19933
19934 SDValue Pred = N->getOperand(1);
19935 SDValue InitVal = N->getOperand(2);
19936 SDValue VecToReduce = N->getOperand(3);
19937 EVT ReduceVT = VecToReduce.getValueType();
19938
19939 // Ordered reductions use the first lane of the result vector as the
19940 // reduction's initial value.
19941 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
19942 InitVal = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReduceVT,
19943 DAG.getUNDEF(ReduceVT), InitVal, Zero);
19944
19945 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, InitVal, VecToReduce);
19946
19947 // SVE reductions set the whole vector register with the first element
19948 // containing the reduction result, which we'll now extract.
19949 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
19950 Zero);
19951}
19952
19953// If a merged operation has no inactive lanes we can relax it to a predicated
19954// or unpredicated operation, which potentially allows better isel (perhaps
19955// using immediate forms) or relaxing register reuse requirements.
19957 SelectionDAG &DAG, bool UnpredOp = false,
19958 bool SwapOperands = false) {
19959 assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN && "Expected intrinsic!");
19960 assert(N->getNumOperands() == 4 && "Expected 3 operand intrinsic!");
19961 SDValue Pg = N->getOperand(1);
19962 SDValue Op1 = N->getOperand(SwapOperands ? 3 : 2);
19963 SDValue Op2 = N->getOperand(SwapOperands ? 2 : 3);
19964
19965 // ISD way to specify an all active predicate.
19966 if (isAllActivePredicate(DAG, Pg)) {
19967 if (UnpredOp)
19968 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Op1, Op2);
19969
19970 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Pg, Op1, Op2);
19971 }
19972
19973 // FUTURE: SplatVector(true)
19974 return SDValue();
19975}
19976
19979 const AArch64Subtarget *Subtarget) {
19980 SelectionDAG &DAG = DCI.DAG;
19981 unsigned IID = getIntrinsicID(N);
19982 switch (IID) {
19983 default:
19984 break;
19985 case Intrinsic::get_active_lane_mask: {
19986 SDValue Res = SDValue();
19987 EVT VT = N->getValueType(0);
19988 if (VT.isFixedLengthVector()) {
19989 // We can use the SVE whilelo instruction to lower this intrinsic by
19990 // creating the appropriate sequence of scalable vector operations and
19991 // then extracting a fixed-width subvector from the scalable vector.
19992
19993 SDLoc DL(N);
19994 SDValue ID =
19995 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
19996
19997 EVT WhileVT = EVT::getVectorVT(
19998 *DAG.getContext(), MVT::i1,
20000
20001 // Get promoted scalable vector VT, i.e. promote nxv4i1 -> nxv4i32.
20002 EVT PromVT = getPromotedVTForPredicate(WhileVT);
20003
20004 // Get the fixed-width equivalent of PromVT for extraction.
20005 EVT ExtVT =
20008
20009 Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WhileVT, ID,
20010 N->getOperand(1), N->getOperand(2));
20011 Res = DAG.getNode(ISD::SIGN_EXTEND, DL, PromVT, Res);
20012 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, Res,
20013 DAG.getConstant(0, DL, MVT::i64));
20014 Res = DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
20015 }
20016 return Res;
20017 }
20018 case Intrinsic::aarch64_neon_vcvtfxs2fp:
20019 case Intrinsic::aarch64_neon_vcvtfxu2fp:
20020 return tryCombineFixedPointConvert(N, DCI, DAG);
20021 case Intrinsic::aarch64_neon_saddv:
20023 case Intrinsic::aarch64_neon_uaddv:
20025 case Intrinsic::aarch64_neon_sminv:
20027 case Intrinsic::aarch64_neon_uminv:
20029 case Intrinsic::aarch64_neon_smaxv:
20031 case Intrinsic::aarch64_neon_umaxv:
20033 case Intrinsic::aarch64_neon_fmax:
20034 return DAG.getNode(ISD::FMAXIMUM, SDLoc(N), N->getValueType(0),
20035 N->getOperand(1), N->getOperand(2));
20036 case Intrinsic::aarch64_neon_fmin:
20037 return DAG.getNode(ISD::FMINIMUM, SDLoc(N), N->getValueType(0),
20038 N->getOperand(1), N->getOperand(2));
20039 case Intrinsic::aarch64_neon_fmaxnm:
20040 return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0),
20041 N->getOperand(1), N->getOperand(2));
20042 case Intrinsic::aarch64_neon_fminnm:
20043 return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0),
20044 N->getOperand(1), N->getOperand(2));
20045 case Intrinsic::aarch64_neon_smull:
20046 return DAG.getNode(AArch64ISD::SMULL, SDLoc(N), N->getValueType(0),
20047 N->getOperand(1), N->getOperand(2));
20048 case Intrinsic::aarch64_neon_umull:
20049 return DAG.getNode(AArch64ISD::UMULL, SDLoc(N), N->getValueType(0),
20050 N->getOperand(1), N->getOperand(2));
20051 case Intrinsic::aarch64_neon_pmull:
20052 return DAG.getNode(AArch64ISD::PMULL, SDLoc(N), N->getValueType(0),
20053 N->getOperand(1), N->getOperand(2));
20054 case Intrinsic::aarch64_neon_sqdmull:
20055 return tryCombineLongOpWithDup(IID, N, DCI, DAG);
20056 case Intrinsic::aarch64_neon_sqshl:
20057 case Intrinsic::aarch64_neon_uqshl:
20058 case Intrinsic::aarch64_neon_sqshlu:
20059 case Intrinsic::aarch64_neon_srshl:
20060 case Intrinsic::aarch64_neon_urshl:
20061 case Intrinsic::aarch64_neon_sshl:
20062 case Intrinsic::aarch64_neon_ushl:
20063 return tryCombineShiftImm(IID, N, DAG);
20064 case Intrinsic::aarch64_neon_sabd:
20065 return DAG.getNode(ISD::ABDS, SDLoc(N), N->getValueType(0),
20066 N->getOperand(1), N->getOperand(2));
20067 case Intrinsic::aarch64_neon_uabd:
20068 return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0),
20069 N->getOperand(1), N->getOperand(2));
20070 case Intrinsic::aarch64_crc32b:
20071 case Intrinsic::aarch64_crc32cb:
20072 return tryCombineCRC32(0xff, N, DAG);
20073 case Intrinsic::aarch64_crc32h:
20074 case Intrinsic::aarch64_crc32ch:
20075 return tryCombineCRC32(0xffff, N, DAG);
20076 case Intrinsic::aarch64_sve_saddv:
20077 // There is no i64 version of SADDV because the sign is irrelevant.
20078 if (N->getOperand(2)->getValueType(0).getVectorElementType() == MVT::i64)
20080 else
20082 case Intrinsic::aarch64_sve_uaddv:
20084 case Intrinsic::aarch64_sve_smaxv:
20086 case Intrinsic::aarch64_sve_umaxv:
20088 case Intrinsic::aarch64_sve_sminv:
20090 case Intrinsic::aarch64_sve_uminv:
20092 case Intrinsic::aarch64_sve_orv:
20094 case Intrinsic::aarch64_sve_eorv:
20096 case Intrinsic::aarch64_sve_andv:
20098 case Intrinsic::aarch64_sve_index:
20099 return LowerSVEIntrinsicIndex(N, DAG);
20100 case Intrinsic::aarch64_sve_dup:
20101 return LowerSVEIntrinsicDUP(N, DAG);
20102 case Intrinsic::aarch64_sve_dup_x:
20103 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), N->getValueType(0),
20104 N->getOperand(1));
20105 case Intrinsic::aarch64_sve_ext:
20106 return LowerSVEIntrinsicEXT(N, DAG);
20107 case Intrinsic::aarch64_sve_mul_u:
20108 return DAG.getNode(AArch64ISD::MUL_PRED, SDLoc(N), N->getValueType(0),
20109 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20110 case Intrinsic::aarch64_sve_smulh_u:
20111 return DAG.getNode(AArch64ISD::MULHS_PRED, SDLoc(N), N->getValueType(0),
20112 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20113 case Intrinsic::aarch64_sve_umulh_u:
20114 return DAG.getNode(AArch64ISD::MULHU_PRED, SDLoc(N), N->getValueType(0),
20115 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20116 case Intrinsic::aarch64_sve_smin_u:
20117 return DAG.getNode(AArch64ISD::SMIN_PRED, SDLoc(N), N->getValueType(0),
20118 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20119 case Intrinsic::aarch64_sve_umin_u:
20120 return DAG.getNode(AArch64ISD::UMIN_PRED, SDLoc(N), N->getValueType(0),
20121 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20122 case Intrinsic::aarch64_sve_smax_u:
20123 return DAG.getNode(AArch64ISD::SMAX_PRED, SDLoc(N), N->getValueType(0),
20124 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20125 case Intrinsic::aarch64_sve_umax_u:
20126 return DAG.getNode(AArch64ISD::UMAX_PRED, SDLoc(N), N->getValueType(0),
20127 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20128 case Intrinsic::aarch64_sve_lsl_u:
20129 return DAG.getNode(AArch64ISD::SHL_PRED, SDLoc(N), N->getValueType(0),
20130 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20131 case Intrinsic::aarch64_sve_lsr_u:
20132 return DAG.getNode(AArch64ISD::SRL_PRED, SDLoc(N), N->getValueType(0),
20133 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20134 case Intrinsic::aarch64_sve_asr_u:
20135 return DAG.getNode(AArch64ISD::SRA_PRED, SDLoc(N), N->getValueType(0),
20136 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20137 case Intrinsic::aarch64_sve_fadd_u:
20138 return DAG.getNode(AArch64ISD::FADD_PRED, SDLoc(N), N->getValueType(0),
20139 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20140 case Intrinsic::aarch64_sve_fdiv_u:
20141 return DAG.getNode(AArch64ISD::FDIV_PRED, SDLoc(N), N->getValueType(0),
20142 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20143 case Intrinsic::aarch64_sve_fmax_u:
20144 return DAG.getNode(AArch64ISD::FMAX_PRED, SDLoc(N), N->getValueType(0),
20145 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20146 case Intrinsic::aarch64_sve_fmaxnm_u:
20147 return DAG.getNode(AArch64ISD::FMAXNM_PRED, SDLoc(N), N->getValueType(0),
20148 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20149 case Intrinsic::aarch64_sve_fmla_u:
20150 return DAG.getNode(AArch64ISD::FMA_PRED, SDLoc(N), N->getValueType(0),
20151 N->getOperand(1), N->getOperand(3), N->getOperand(4),
20152 N->getOperand(2));
20153 case Intrinsic::aarch64_sve_fmin_u:
20154 return DAG.getNode(AArch64ISD::FMIN_PRED, SDLoc(N), N->getValueType(0),
20155 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20156 case Intrinsic::aarch64_sve_fminnm_u:
20157 return DAG.getNode(AArch64ISD::FMINNM_PRED, SDLoc(N), N->getValueType(0),
20158 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20159 case Intrinsic::aarch64_sve_fmul_u:
20160 return DAG.getNode(AArch64ISD::FMUL_PRED, SDLoc(N), N->getValueType(0),
20161 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20162 case Intrinsic::aarch64_sve_fsub_u:
20163 return DAG.getNode(AArch64ISD::FSUB_PRED, SDLoc(N), N->getValueType(0),
20164 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20165 case Intrinsic::aarch64_sve_add_u:
20166 return DAG.getNode(ISD::ADD, SDLoc(N), N->getValueType(0), N->getOperand(2),
20167 N->getOperand(3));
20168 case Intrinsic::aarch64_sve_sub_u:
20169 return DAG.getNode(ISD::SUB, SDLoc(N), N->getValueType(0), N->getOperand(2),
20170 N->getOperand(3));
20171 case Intrinsic::aarch64_sve_subr:
20172 return convertMergedOpToPredOp(N, ISD::SUB, DAG, true, true);
20173 case Intrinsic::aarch64_sve_and_u:
20174 return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0), N->getOperand(2),
20175 N->getOperand(3));
20176 case Intrinsic::aarch64_sve_bic_u:
20177 return DAG.getNode(AArch64ISD::BIC, SDLoc(N), N->getValueType(0),
20178 N->getOperand(2), N->getOperand(3));
20179 case Intrinsic::aarch64_sve_eor_u:
20180 return DAG.getNode(ISD::XOR, SDLoc(N), N->getValueType(0), N->getOperand(2),
20181 N->getOperand(3));
20182 case Intrinsic::aarch64_sve_orr_u:
20183 return DAG.getNode(ISD::OR, SDLoc(N), N->getValueType(0), N->getOperand(2),
20184 N->getOperand(3));
20185 case Intrinsic::aarch64_sve_sabd_u:
20186 return DAG.getNode(ISD::ABDS, SDLoc(N), N->getValueType(0),
20187 N->getOperand(2), N->getOperand(3));
20188 case Intrinsic::aarch64_sve_uabd_u:
20189 return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0),
20190 N->getOperand(2), N->getOperand(3));
20191 case Intrinsic::aarch64_sve_sdiv_u:
20192 return DAG.getNode(AArch64ISD::SDIV_PRED, SDLoc(N), N->getValueType(0),
20193 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20194 case Intrinsic::aarch64_sve_udiv_u:
20195 return DAG.getNode(AArch64ISD::UDIV_PRED, SDLoc(N), N->getValueType(0),
20196 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20197 case Intrinsic::aarch64_sve_sqadd:
20198 return convertMergedOpToPredOp(N, ISD::SADDSAT, DAG, true);
20199 case Intrinsic::aarch64_sve_sqsub_u:
20200 return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0),
20201 N->getOperand(2), N->getOperand(3));
20202 case Intrinsic::aarch64_sve_uqadd:
20203 return convertMergedOpToPredOp(N, ISD::UADDSAT, DAG, true);
20204 case Intrinsic::aarch64_sve_uqsub_u:
20205 return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
20206 N->getOperand(2), N->getOperand(3));
20207 case Intrinsic::aarch64_sve_sqadd_x:
20208 return DAG.getNode(ISD::SADDSAT, SDLoc(N), N->getValueType(0),
20209 N->getOperand(1), N->getOperand(2));
20210 case Intrinsic::aarch64_sve_sqsub_x:
20211 return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0),
20212 N->getOperand(1), N->getOperand(2));
20213 case Intrinsic::aarch64_sve_uqadd_x:
20214 return DAG.getNode(ISD::UADDSAT, SDLoc(N), N->getValueType(0),
20215 N->getOperand(1), N->getOperand(2));
20216 case Intrinsic::aarch64_sve_uqsub_x:
20217 return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
20218 N->getOperand(1), N->getOperand(2));
20219 case Intrinsic::aarch64_sve_asrd:
20220 return DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, SDLoc(N), N->getValueType(0),
20221 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20222 case Intrinsic::aarch64_sve_cmphs:
20223 if (!N->getOperand(2).getValueType().isFloatingPoint())
20225 N->getValueType(0), N->getOperand(1), N->getOperand(2),
20226 N->getOperand(3), DAG.getCondCode(ISD::SETUGE));
20227 break;
20228 case Intrinsic::aarch64_sve_cmphi:
20229 if (!N->getOperand(2).getValueType().isFloatingPoint())
20231 N->getValueType(0), N->getOperand(1), N->getOperand(2),
20232 N->getOperand(3), DAG.getCondCode(ISD::SETUGT));
20233 break;
20234 case Intrinsic::aarch64_sve_fcmpge:
20235 case Intrinsic::aarch64_sve_cmpge:
20237 N->getValueType(0), N->getOperand(1), N->getOperand(2),
20238 N->getOperand(3), DAG.getCondCode(ISD::SETGE));
20239 break;
20240 case Intrinsic::aarch64_sve_fcmpgt:
20241 case Intrinsic::aarch64_sve_cmpgt:
20243 N->getValueType(0), N->getOperand(1), N->getOperand(2),
20244 N->getOperand(3), DAG.getCondCode(ISD::SETGT));
20245 break;
20246 case Intrinsic::aarch64_sve_fcmpeq:
20247 case Intrinsic::aarch64_sve_cmpeq:
20249 N->getValueType(0), N->getOperand(1), N->getOperand(2),
20250 N->getOperand(3), DAG.getCondCode(ISD::SETEQ));
20251 break;
20252 case Intrinsic::aarch64_sve_fcmpne:
20253 case Intrinsic::aarch64_sve_cmpne:
20255 N->getValueType(0), N->getOperand(1), N->getOperand(2),
20256 N->getOperand(3), DAG.getCondCode(ISD::SETNE));
20257 break;
20258 case Intrinsic::aarch64_sve_fcmpuo:
20260 N->getValueType(0), N->getOperand(1), N->getOperand(2),
20261 N->getOperand(3), DAG.getCondCode(ISD::SETUO));
20262 break;
20263 case Intrinsic::aarch64_sve_fadda:
20265 case Intrinsic::aarch64_sve_faddv:
20267 case Intrinsic::aarch64_sve_fmaxnmv:
20269 case Intrinsic::aarch64_sve_fmaxv:
20271 case Intrinsic::aarch64_sve_fminnmv:
20273 case Intrinsic::aarch64_sve_fminv:
20275 case Intrinsic::aarch64_sve_sel:
20276 return DAG.getNode(ISD::VSELECT, SDLoc(N), N->getValueType(0),
20277 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20278 case Intrinsic::aarch64_sve_cmpeq_wide:
20279 return tryConvertSVEWideCompare(N, ISD::SETEQ, DCI, DAG);
20280 case Intrinsic::aarch64_sve_cmpne_wide:
20281 return tryConvertSVEWideCompare(N, ISD::SETNE, DCI, DAG);
20282 case Intrinsic::aarch64_sve_cmpge_wide:
20283 return tryConvertSVEWideCompare(N, ISD::SETGE, DCI, DAG);
20284 case Intrinsic::aarch64_sve_cmpgt_wide:
20285 return tryConvertSVEWideCompare(N, ISD::SETGT, DCI, DAG);
20286 case Intrinsic::aarch64_sve_cmplt_wide:
20287 return tryConvertSVEWideCompare(N, ISD::SETLT, DCI, DAG);
20288 case Intrinsic::aarch64_sve_cmple_wide:
20289 return tryConvertSVEWideCompare(N, ISD::SETLE, DCI, DAG);
20290 case Intrinsic::aarch64_sve_cmphs_wide:
20291 return tryConvertSVEWideCompare(N, ISD::SETUGE, DCI, DAG);
20292 case Intrinsic::aarch64_sve_cmphi_wide:
20293 return tryConvertSVEWideCompare(N, ISD::SETUGT, DCI, DAG);
20294 case Intrinsic::aarch64_sve_cmplo_wide:
20295 return tryConvertSVEWideCompare(N, ISD::SETULT, DCI, DAG);
20296 case Intrinsic::aarch64_sve_cmpls_wide:
20297 return tryConvertSVEWideCompare(N, ISD::SETULE, DCI, DAG);
20298 case Intrinsic::aarch64_sve_ptest_any:
20299 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
20301 case Intrinsic::aarch64_sve_ptest_first:
20302 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
20304 case Intrinsic::aarch64_sve_ptest_last:
20305 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
20307 }
20308 return SDValue();
20309}
20310
// Returns true when extending the result of \p N is considered cheap, e.g.
// because a (masked) load can be folded into an extending load.
// NOTE(review): the continuation of this condition (original line 20314) is
// elided from this excerpt; the visible cases are LOAD and MLOAD.
20311 static bool isCheapToExtend(const SDValue &N) {
20312   unsigned OC = N->getOpcode();
20313   return OC == ISD::LOAD || OC == ISD::MLOAD ||
20315 }
20316
// Fold (sext (setcc A, B)) -> (setcc (ext A), (ext B)) when both compare
// operands are cheap to extend (see isCheapToExtend above), choosing sext or
// zext according to the signedness of the condition code.
// NOTE(review): the function-name line (original 20318) is elided in this
// view; the call site later in the file refers to this helper as
// performSignExtendSetCCCombine.
20317 static SDValue
20319                              SelectionDAG &DAG) {
20320   // If we have (sext (setcc A B)) and A and B are cheap to extend,
20321   // we can move the sext into the arguments and have the same result. For
20322   // example, if A and B are both loads, we can make those extending loads and
20323   // avoid an extra instruction. This pattern appears often in VLS code
20324   // generation where the inputs to the setcc have a different size to the
20325   // instruction that wants to use the result of the setcc.
20326   assert(N->getOpcode() == ISD::SIGN_EXTEND &&
20327          N->getOperand(0)->getOpcode() == ISD::SETCC);
20328   const SDValue SetCC = N->getOperand(0);
20329
20330   const SDValue CCOp0 = SetCC.getOperand(0);
20331   const SDValue CCOp1 = SetCC.getOperand(1);
20332   if (!CCOp0->getValueType(0).isInteger() ||
20333       !CCOp1->getValueType(0).isInteger())
20334     return SDValue();
20335
20336   ISD::CondCode Code =
20337       cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get();
20338
// A signed comparison needs sign-extended inputs to give the same result;
// an unsigned (or equality) comparison is preserved by zero extension.
20339   ISD::NodeType ExtType =
20340       isSignedIntSetCC(Code) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
20341
20342   if (isCheapToExtend(SetCC.getOperand(0)) &&
20343       isCheapToExtend(SetCC.getOperand(1))) {
20344     const SDValue Ext1 =
20345         DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp0);
20346     const SDValue Ext2 =
20347         DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp1);
20348
// Rebuild the setcc at the extended type; the original sext of the setcc
// result becomes unnecessary.
20349     return DAG.getSetCC(
20350         SDLoc(SetCC), N->getValueType(0), Ext1, Ext2,
20351         cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get());
20352   }
20353
20354   return SDValue();
20355 }
20356
// Combine vector extends:
//  * (zext (abdu/abds (extract_high ...), (DUP ...))) -> rewrite the
//    absolute-difference inputs so the backend can select [us]abdl2 and save
//    a real extract_high operation;
//  * (sext (setcc ...)) on fixed-length vectors -> forwarded to
//    performSignExtendSetCCCombine.
// NOTE(review): the signature lines (original 20357-20358) and the call that
// produces NewABD (original 20369, presumably a DUP-to-extract_high rewrite
// helper) are elided from this excerpt — confirm against the full source.
20359                                                SelectionDAG &DAG) {
20360   // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
20361   // we can convert that DUP into another extract_high (of a bigger DUP), which
20362   // helps the backend to decide that an sabdl2 would be useful, saving a real
20363   // extract_high operation.
20364   if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
20365       (N->getOperand(0).getOpcode() == ISD::ABDU ||
20366        N->getOperand(0).getOpcode() == ISD::ABDS)) {
20367     SDNode *ABDNode = N->getOperand(0).getNode();
20368     SDValue NewABD =
20370     if (!NewABD.getNode())
20371       return SDValue();
20372
20373     return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), NewABD);
20374   }
20375
20376   if (N->getValueType(0).isFixedLengthVector() &&
20377       N->getOpcode() == ISD::SIGN_EXTEND &&
20378       N->getOperand(0)->getOpcode() == ISD::SETCC)
20379     return performSignExtendSetCCCombine(N, DCI, DAG);
20380
20381   return SDValue();
20382 }
20383
// Replace a vector store of a splatted value with NumVecElts scalar stores of
// SplatVal, chained together; the load/store optimizer can then merge them
// into store-pair (STP) instructions.
// NOTE(review): the first signature line (original 20384) is elided; the call
// sites below invoke this as splitStoreSplat(DAG, St, SplatVal, NumVecElts).
20385                                SDValue SplatVal, unsigned NumVecElts) {
20386   assert(!St.isTruncatingStore() && "cannot split truncating vector store");
20387   Align OrigAlignment = St.getAlign();
// Byte stride between successive scalar stores = element size in bytes.
20388   unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8;
20389
20390   // Create scalar stores. This is at least as good as the code sequence for a
20391   // split unaligned store which is a dup.s, ext.b, and two stores.
20392   // Most of the time the three stores should be replaced by store pair
20393   // instructions (stp).
20394   SDLoc DL(&St);
20395   SDValue BasePtr = St.getBasePtr();
20396   uint64_t BaseOffset = 0;
20397
20398   const MachinePointerInfo &PtrInfo = St.getPointerInfo();
20399   SDValue NewST1 =
20400       DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo,
20401                    OrigAlignment, St.getMemOperand()->getFlags());
20402
20403   // As this in ISel, we will not merge this add which may degrade results.
20404   if (BasePtr->getOpcode() == ISD::ADD &&
20405       isa<ConstantSDNode>(BasePtr->getOperand(1))) {
// Peel a constant offset off the base pointer so each scalar store gets a
// single combined (BaseOffset + Offset) immediate instead of chained adds.
20406     BaseOffset = cast<ConstantSDNode>(BasePtr->getOperand(1))->getSExtValue();
20407     BasePtr = BasePtr->getOperand(0);
20408   }
20409
// Emit the remaining NumVecElts - 1 stores; the first was emitted above.
20410   unsigned Offset = EltOffset;
20411   while (--NumVecElts) {
20412     Align Alignment = commonAlignment(OrigAlignment, Offset);
20413     SDValue OffsetPtr =
20414         DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
20415                     DAG.getConstant(BaseOffset + Offset, DL, MVT::i64));
20416     NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
20417                           PtrInfo.getWithOffset(Offset), Alignment,
20418                           St.getMemOperand()->getFlags());
20419     Offset += EltOffset;
20420   }
// Returns the chain of the final store.
20421   return NewST1;
20422 }
20423
20424// Returns an SVE type that ContentTy can be trivially sign or zero extended
20425// into.
20426static MVT getSVEContainerType(EVT ContentTy) {
20427 assert(ContentTy.isSimple() && "No SVE containers for extended types");
20428
20429 switch (ContentTy.getSimpleVT().SimpleTy) {
20430 default:
20431 llvm_unreachable("No known SVE container for this MVT type");
20432 case MVT::nxv2i8:
20433 case MVT::nxv2i16:
20434 case MVT::nxv2i32:
20435 case MVT::nxv2i64:
20436 case MVT::nxv2f32:
20437 case MVT::nxv2f64:
20438 return MVT::nxv2i64;
20439 case MVT::nxv4i8:
20440 case MVT::nxv4i16:
20441 case MVT::nxv4i32:
20442 case MVT::nxv4f32:
20443 return MVT::nxv4i32;
20444 case MVT::nxv8i8:
20445 case MVT::nxv8i16:
20446 case MVT::nxv8f16:
20447 case MVT::nxv8bf16:
20448 return MVT::nxv8i16;
20449 case MVT::nxv16i8:
20450 return MVT::nxv16i8;
20451 }
20452}
20453
// Lower an SVE predicated-load intrinsic node to the AArch64ISD opcode Opc:
// the load is performed in the wider SVE container type and the result is
// truncated back to VT when the two differ.
// NOTE(review): an early-return guard (original line 20458) is elided between
// the VT computation and the bail-out below — confirm against full source.
20454 static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) {
20455   SDLoc DL(N);
20456   EVT VT = N->getValueType(0);
20457
20459     return SDValue();
20460
// Integer results are loaded in their SVE container type (e.g. nxv4i8 is
// loaded as nxv4i32) and truncated afterwards.
20461   EVT ContainerVT = VT;
20462   if (ContainerVT.isInteger())
20463     ContainerVT = getSVEContainerType(ContainerVT);
20464
20465   SDVTList VTs = DAG.getVTList(ContainerVT, MVT::Other);
20466   SDValue Ops[] = { N->getOperand(0), // Chain
20467                     N->getOperand(2), // Pg
20468                     N->getOperand(3), // Base
20469                     DAG.getValueType(VT) };
20470
20471   SDValue Load = DAG.getNode(Opc, DL, VTs, Ops);
20472   SDValue LoadChain = SDValue(Load.getNode(), 1);
20473
20474   if (ContainerVT.isInteger() && (VT != ContainerVT))
20475     Load = DAG.getNode(ISD::TRUNCATE, DL, VT, Load.getValue(0));
20476
20477   return DAG.getMergeValues({ Load, LoadChain }, DL);
20478 }
20479
// Lower an SVE non-temporal-load intrinsic to a masked load performed in an
// integer type, bitcasting the result back to the original floating-point
// type when necessary. The pass-through value is zero.
// NOTE(review): the signature line (original 20480) and the extension-type /
// indexing arguments of getMaskedLoad (original 20495) are elided from this
// excerpt — confirm against the full source.
20481   SDLoc DL(N);
20482   EVT VT = N->getValueType(0);
20483   EVT PtrTy = N->getOperand(3).getValueType();
20484
// FP data is loaded as the same-sized integer type and bitcast afterwards.
20485   EVT LoadVT = VT;
20486   if (VT.isFloatingPoint())
20487     LoadVT = VT.changeTypeToInteger();
20488
20489   auto *MINode = cast<MemIntrinsicSDNode>(N);
20490   SDValue PassThru = DAG.getConstant(0, DL, LoadVT);
20491   SDValue L = DAG.getMaskedLoad(LoadVT, DL, MINode->getChain(),
20492                                 MINode->getOperand(3), DAG.getUNDEF(PtrTy),
20493                                 MINode->getOperand(2), PassThru,
20494                                 MINode->getMemoryVT(), MINode->getMemOperand(),
20496
20497   if (VT.isFloatingPoint()) {
20498     SDValue Ops[] = { DAG.getNode(ISD::BITCAST, DL, VT, L), L.getValue(1) };
20499     return DAG.getMergeValues(Ops, DL);
20500   }
20501
20502   return L;
20503 }
20504
// Lower an SVE replicating-load intrinsic (ld1rq-style) to the AArch64ISD
// node given by the template parameter Opcode, performing the memory access
// in an integer type and bitcasting for floating-point results.
// NOTE(review): the signature line (original 20506) and the second accepted
// opcode in the static_assert (original 20508, presumably the LD1RO merge-
// zero node) are elided from this excerpt — confirm against the full source.
20505 template <unsigned Opcode>
20507   static_assert(Opcode == AArch64ISD::LD1RQ_MERGE_ZERO ||
20509                 "Unsupported opcode.");
20510   SDLoc DL(N);
20511   EVT VT = N->getValueType(0);
20512
// FP data is loaded as the same-sized integer type and bitcast afterwards.
20513   EVT LoadVT = VT;
20514   if (VT.isFloatingPoint())
20515     LoadVT = VT.changeTypeToInteger();
20516
20517   SDValue Ops[] = {N->getOperand(0), N->getOperand(2), N->getOperand(3)};
20518   SDValue Load = DAG.getNode(Opcode, DL, {LoadVT, MVT::Other}, Ops);
20519   SDValue LoadChain = SDValue(Load.getNode(), 1);
20520
20521   if (VT.isFloatingPoint())
20522     Load = DAG.getNode(ISD::BITCAST, DL, VT, Load.getValue(0));
20523
20524   return DAG.getMergeValues({Load, LoadChain}, DL);
20525 }
20526
// Lower an SVE st1 intrinsic to AArch64ISD::ST1_PRED, widening the stored
// data to its SVE container type first: bitcast for floating-point data,
// any-extend for integer data.
// NOTE(review): the signature line (original 20527) is elided from this
// excerpt — confirm against the full source.
20528   SDLoc DL(N);
20529   SDValue Data = N->getOperand(2);
20530   EVT DataVT = Data.getValueType();
20531   EVT HwSrcVt = getSVEContainerType(DataVT);
20532   SDValue InputVT = DAG.getValueType(DataVT);
20533
// For FP data the recorded memory VT is the integer container type, since
// the value itself is bitcast to integer below.
20534   if (DataVT.isFloatingPoint())
20535     InputVT = DAG.getValueType(HwSrcVt);
20536
20537   SDValue SrcNew;
20538   if (Data.getValueType().isFloatingPoint())
20539     SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Data);
20540   else
20541     SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Data);
20542
20543   SDValue Ops[] = { N->getOperand(0), // Chain
20544                     SrcNew,
20545                     N->getOperand(4), // Base
20546                     N->getOperand(3), // Pg
20547                     InputVT
20548                   };
20549
20550   return DAG.getNode(AArch64ISD::ST1_PRED, DL, N->getValueType(0), Ops);
20551 }
20552
// Lower an SVE non-temporal-store intrinsic to an unindexed, non-truncating
// masked store.
// NOTE(review): the signature line (original 20553) and the floating-point
// data conversion statement (original 20561, presumably a bitcast of Data to
// an integer type) are elided from this excerpt — confirm against the full
// source.
20554   SDLoc DL(N);
20555
20556   SDValue Data = N->getOperand(2);
20557   EVT DataVT = Data.getValueType();
20558   EVT PtrTy = N->getOperand(4).getValueType();
20559
20560   if (DataVT.isFloatingPoint())
20562
20563   auto *MINode = cast<MemIntrinsicSDNode>(N);
20564   return DAG.getMaskedStore(MINode->getChain(), DL, Data, MINode->getOperand(4),
20565                             DAG.getUNDEF(PtrTy), MINode->getOperand(3),
20566                             MINode->getMemoryVT(), MINode->getMemOperand(),
20567                             ISD::UNINDEXED, false, false);
20568 }
20569
20570 /// Replace a splat of zeros to a vector store by scalar stores of WZR/XZR. The
20571 /// load store optimizer pass will merge them to store pair stores. This should
20572 /// be better than a movi to create the vector zero followed by a vector store
20573 /// if the zero constant is not re-used, since one instructions and one register
20574 /// live range will be removed.
20575 ///
20576 /// For example, the final generated code should be:
20577 ///
20578 ///   stp xzr, xzr, [x0]
20579 ///
20580 /// instead of:
20581 ///
20582 ///   movi v0.2d, #0
20583 ///   str q0, [x0]
20584 ///
// NOTE(review): the signature line (original 20585) is elided; the call site
// below invokes this as replaceZeroVectorStore(DAG, St).
20586   SDValue StVal = St.getValue();
20587   EVT VT = StVal.getValueType();
20588
20589   // Avoid scalarizing zero splat stores for scalable vectors.
20590   if (VT.isScalableVector())
20591     return SDValue();
20592
20593   // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or
20594   // 2, 3 or 4 i32 elements.
20595   int NumVecElts = VT.getVectorNumElements();
20596   if (!(((NumVecElts == 2 || NumVecElts == 3) &&
20597          VT.getVectorElementType().getSizeInBits() == 64) ||
20598         ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) &&
20599          VT.getVectorElementType().getSizeInBits() == 32)))
20600     return SDValue();
20601
20602   if (StVal.getOpcode() != ISD::BUILD_VECTOR)
20603     return SDValue();
20604
20605   // If the zero constant has more than one use then the vector store could be
20606   // better since the constant mov will be amortized and stp q instructions
20607   // should be able to be formed.
20608   if (!StVal.hasOneUse())
20609     return SDValue();
20610
20611   // If the store is truncating then it's going down to i16 or smaller, which
20612   // means it can be implemented in a single store anyway.
20613   if (St.isTruncatingStore())
20614     return SDValue();
20615
20616   // If the immediate offset of the address operand is too large for the stp
20617   // instruction, then bail out.
20618   if (DAG.isBaseWithConstantOffset(St.getBasePtr())) {
20619     int64_t Offset = St.getBasePtr()->getConstantOperandVal(1);
20620     if (Offset < -512 || Offset > 504)
20621       return SDValue();
20622   }
20623
// Every element of the BUILD_VECTOR must be the (integer or FP) zero
// constant for this to be a zero-splat store.
20624   for (int I = 0; I < NumVecElts; ++I) {
20625     SDValue EltVal = StVal.getOperand(I);
20626     if (!isNullConstant(EltVal) && !isNullFPConstant(EltVal))
20627       return SDValue();
20628   }
20629
20630   // Use a CopyFromReg WZR/XZR here to prevent
20631   // DAGCombiner::MergeConsecutiveStores from undoing this transformation.
20632   SDLoc DL(&St);
20633   unsigned ZeroReg;
20634   EVT ZeroVT;
20635   if (VT.getVectorElementType().getSizeInBits() == 32) {
20636     ZeroReg = AArch64::WZR;
20637     ZeroVT = MVT::i32;
20638   } else {
20639     ZeroReg = AArch64::XZR;
20640     ZeroVT = MVT::i64;
20641   }
20642   SDValue SplatVal =
20643       DAG.getCopyFromReg(DAG.getEntryNode(), DL, ZeroReg, ZeroVT);
20644   return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
20645 }
20646
20647 /// Replace a splat of a scalar to a vector store by scalar stores of the scalar
20648 /// value. The load store optimizer pass will merge them to store pair stores.
20649 /// This has better performance than a splat of the scalar followed by a split
20650 /// vector store. Even if the stores are not merged it is four stores vs a dup,
20651 /// followed by an ext.b and two stores.
// NOTE(review): the signature line (original 20652) is elided; the call site
// below invokes this as replaceSplatVectorStore(DAG, *S).
20653   SDValue StVal = St.getValue();
20654   EVT VT = StVal.getValueType();
20655
20656   // Don't replace floating point stores, they possibly won't be transformed to
20657   // stp because of the store pair suppress pass.
20658   if (VT.isFloatingPoint())
20659     return SDValue();
20660
20661   // We can express a splat as store pair(s) for 2 or 4 elements.
20662   unsigned NumVecElts = VT.getVectorNumElements();
20663   if (NumVecElts != 4 && NumVecElts != 2)
20664     return SDValue();
20665
20666   // If the store is truncating then it's going down to i16 or smaller, which
20667   // means it can be implemented in a single store anyway.
20668   if (St.isTruncatingStore())
20669     return SDValue();
20670
20671   // Check that this is a splat.
20672   // Make sure that each of the relevant vector element locations are inserted
20673   // to, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for v4i32.
20674   std::bitset<4> IndexNotInserted((1 << NumVecElts) - 1);
20675   SDValue SplatVal;
// Walk the chain of INSERT_VECTOR_ELT nodes feeding the store, verifying
// that one common scalar is inserted at every lane exactly once.
20676   for (unsigned I = 0; I < NumVecElts; ++I) {
20677     // Check for insert vector elements.
20678     if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
20679       return SDValue();
20680
20681     // Check that same value is inserted at each vector element.
20682     if (I == 0)
20683       SplatVal = StVal.getOperand(1);
20684     else if (StVal.getOperand(1) != SplatVal)
20685       return SDValue();
20686
20687     // Check insert element index.
20688     ConstantSDNode *CIndex = dyn_cast<ConstantSDNode>(StVal.getOperand(2));
20689     if (!CIndex)
20690       return SDValue();
20691     uint64_t IndexVal = CIndex->getZExtValue();
20692     if (IndexVal >= NumVecElts)
20693       return SDValue();
20694     IndexNotInserted.reset(IndexVal);
20695
// Continue with the vector the insert was applied to.
20696     StVal = StVal.getOperand(0);
20697   }
20698   // Check that all vector element locations were inserted to.
20699   if (IndexNotInserted.any())
20700     return SDValue();
20701
20702   return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
20703 }
20704
// Store combine for fixed-length vector stores: scalarize zero-splat and
// scalar-splat stores (see the helpers above), and split slow unaligned 128-
// bit stores into two 64-bit halves.
// NOTE(review): the first signature line (original 20705, carrying the
// function name) and the -Oz / minsize guard condition (original 20733) are
// elided from this excerpt — confirm against the full source.
20706                            SelectionDAG &DAG,
20707                            const AArch64Subtarget *Subtarget) {
20708
20709   StoreSDNode *S = cast<StoreSDNode>(N);
20710   if (S->isVolatile() || S->isIndexed())
20711     return SDValue();
20712
20713   SDValue StVal = S->getValue();
20714   EVT VT = StVal.getValueType();
20715
20716   if (!VT.isFixedLengthVector())
20717     return SDValue();
20718
20719   // If we get a splat of zeros, convert this vector store to a store of
20720   // scalars. They will be merged into store pairs of xzr thereby removing one
20721   // instruction and one register.
20722   if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S))
20723     return ReplacedZeroSplat;
20724
20725   // FIXME: The logic for deciding if an unaligned store should be split should
20726   // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
20727   // a call to that function here.
20728
20729   if (!Subtarget->isMisaligned128StoreSlow())
20730     return SDValue();
20731
20732   // Don't split at -Oz.
20734     return SDValue();
20735
20736   // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
20737   // those up regresses performance on micro-benchmarks and olden/bh.
20738   if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
20739     return SDValue();
20740
20741   // Split unaligned 16B stores. They are terrible for performance.
20742   // Don't split stores with alignment of 1 or 2. Code that uses clang vector
20743   // extensions can use this to mark that it does not want splitting to happen
20744   // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
20745   // eliminating alignment hazards is only 1 in 8 for alignment of 2.
20746   if (VT.getSizeInBits() != 128 || S->getAlign() >= Align(16) ||
20747       S->getAlign() <= Align(2))
20748     return SDValue();
20749
20750   // If we get a splat of a scalar convert this vector store to a store of
20751   // scalars. They will be merged into store pairs thereby removing two
20752   // instructions.
20753   if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, *S))
20754     return ReplacedSplat;
20755
20756   SDLoc DL(S);
20757
20758   // Split VT into two.
20759   EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
20760   unsigned NumElts = HalfVT.getVectorNumElements();
20761   SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
20762                                    DAG.getConstant(0, DL, MVT::i64));
20763   SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
20764                                    DAG.getConstant(NumElts, DL, MVT::i64));
20765   SDValue BasePtr = S->getBasePtr();
// Store the low half at the original address and the high half 8 bytes
// further on (half of the 128-bit store).
20766   SDValue NewST1 =
20767       DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
20768                    S->getAlign(), S->getMemOperand()->getFlags());
20769   SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
20770                                   DAG.getConstant(8, DL, MVT::i64));
20771   return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
20772                       S->getPointerInfo(), S->getAlign(),
20773                       S->getMemOperand()->getFlags());
20774 }
20775
// Simplify AArch64ISD::SPLICE: splicing with an undef second operand leaves
// the first operand unchanged.
// NOTE(review): the signature line (original 20776) is elided from this
// excerpt — confirm against the full source.
20777   assert(N->getOpcode() == AArch64ISD::SPLICE && "Unexepected Opcode!");
20778
20779   // splice(pg, op1, undef) -> op1
20780   if (N->getOperand(2).isUndef())
20781     return N->getOperand(1);
20782
20783   return SDValue();
20784 }
20785
// Combine for AArch64ISD::UUNPKLO/UUNPKHI: fold unpack-of-undef to undef,
// and fold (uunpklo (masked_load ...)) into a wider masked zero-extending
// load when the predicate pattern can be doubled in size.
// NOTE(review): the signature line (original 20786) and the trailing
// extension-type / indexing arguments of getMaskedLoad (original 20823) are
// elided from this excerpt — confirm against the full source.
20787                                      const AArch64Subtarget *Subtarget) {
20788   assert((N->getOpcode() == AArch64ISD::UUNPKHI ||
20789           N->getOpcode() == AArch64ISD::UUNPKLO) &&
20790          "Unexpected Opcode!");
20791
20792   // uunpklo/hi undef -> undef
20793   if (N->getOperand(0).isUndef())
20794     return DAG.getUNDEF(N->getValueType(0));
20795
20796   // If this is a masked load followed by an UUNPKLO, fold this into a masked
20797   // extending load. We can do this even if this is already a masked
20798   // {z,}extload.
20799   if (N->getOperand(0).getOpcode() == ISD::MLOAD &&
20800       N->getOpcode() == AArch64ISD::UUNPKLO) {
20801     MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(N->getOperand(0));
20802     SDValue Mask = MLD->getMask();
20803     SDLoc DL(N);
20804
// The fold needs: an unindexed, non-sext load whose value is only used by
// this unpack, a PTRUE mask with a known pattern, and a zero/undef
// pass-through (so widening cannot change observed lanes).
20805     if (MLD->isUnindexed() && MLD->getExtensionType() != ISD::SEXTLOAD &&
20806         SDValue(MLD, 0).hasOneUse() && Mask->getOpcode() == AArch64ISD::PTRUE &&
20807         (MLD->getPassThru()->isUndef() ||
20808          isZerosVector(MLD->getPassThru().getNode()))) {
20809       unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
20810       unsigned PgPattern = Mask->getConstantOperandVal(0);
20811       EVT VT = N->getValueType(0);
20812
20813       // Ensure we can double the size of the predicate pattern
20814       unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
20815       if (NumElts &&
20816           NumElts * VT.getVectorElementType().getSizeInBits() <= MinSVESize) {
20817         Mask =
20818             getPTrue(DAG, DL, VT.changeVectorElementType(MVT::i1), PgPattern);
20819         SDValue PassThru = DAG.getConstant(0, DL, VT);
20820         SDValue NewLoad = DAG.getMaskedLoad(
20821             VT, DL, MLD->getChain(), MLD->getBasePtr(), MLD->getOffset(), Mask,
20822             PassThru, MLD->getMemoryVT(), MLD->getMemOperand(),
20824
// Keep other users of the old load's chain working.
20825         DAG.ReplaceAllUsesOfValueWith(SDValue(MLD, 1), NewLoad.getValue(1));
20826
20827         return NewLoad;
20828       }
20829     }
20830   }
20831
20832   return SDValue();
20833 }
20834
20835 // Try to simplify:
20836 //    t1 = nxv8i16 add(X, 1 << (ShiftValue - 1))
20837 //    t2 = nxv8i16 srl(t1, ShiftValue)
20838 // to
20839 //    t1 = nxv8i16 rshrnb(X, shiftvalue).
20840 // rshrnb will zero the top half bits of each element. Therefore, this combine
20841 // should only be performed when a following instruction with the rshrnb
20842 // as an operand does not care about the top half of each element. For example,
20843 // a uzp1 or a truncating store.
// NOTE(review): the signature line (original 20844) is elided; call sites
// invoke this as trySimplifySrlAddToRshrnb(Op, DAG, Subtarget). Requires
// SVE2 (which provides the RSHRNB instruction).
20845                                          const AArch64Subtarget *Subtarget) {
20846   EVT VT = Srl->getValueType(0);
20847
20848   if (!VT.isScalableVector() || !Subtarget->hasSVE2() ||
20849       Srl->getOpcode() != ISD::SRL)
20850     return SDValue();
20851
// ResVT is the half-width type RSHRNB narrows into.
20852   EVT ResVT;
20853   if (VT == MVT::nxv8i16)
20854     ResVT = MVT::nxv16i8;
20855   else if (VT == MVT::nxv4i32)
20856     ResVT = MVT::nxv8i16;
20857   else if (VT == MVT::nxv2i64)
20858     ResVT = MVT::nxv4i32;
20859   else
20860     return SDValue();
20861
// The shift amount must be a splat constant within the narrowed element
// width for a rounding-shift-right-narrow encoding.
20862   auto SrlOp1 =
20863       dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Srl->getOperand(1)));
20864   if (!SrlOp1)
20865     return SDValue();
20866   unsigned ShiftValue = SrlOp1->getZExtValue();
20867   if (ShiftValue < 1 || ShiftValue > ResVT.getScalarSizeInBits())
20868     return SDValue();
20869
20870   SDValue Add = Srl->getOperand(0);
20871   if (Add->getOpcode() != ISD::ADD || !Add->hasOneUse())
20872     return SDValue();
20873   auto AddOp1 =
20874       dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Add->getOperand(1)));
20875   if (!AddOp1)
20876     return SDValue();
20877   uint64_t AddValue = AddOp1->getZExtValue();
// The added constant must be exactly the rounding bias 2^(ShiftValue-1).
20878   if (AddValue != 1ULL << (ShiftValue - 1))
20879     return SDValue();
20880
20881   SDLoc DL(Srl);
20882   SDValue Rshrnb = DAG.getNode(
20883       AArch64ISD::RSHRNB_I, DL, ResVT,
20884       {Add->getOperand(0), DAG.getTargetConstant(ShiftValue, DL, MVT::i32)});
20885   return DAG.getNode(ISD::BITCAST, DL, VT, Rshrnb);
20886 }
20887
// Combine for AArch64ISD::UZP1:
//  * uzp1(x, undef)        -> concat(truncate(x), undef)
//  * fold rounding-srl-add operands into RSHRNB (see helper above)
//  * collapse uzp1/unpack chains
//  * uzp1(xtn x, xtn y)    -> xtn(uzp1(x, y)) (little-endian only)
// NOTE(review): the signature line (original 20888) is elided from this
// excerpt — confirm against the full source.
20889                                  const AArch64Subtarget *Subtarget) {
20890   SDLoc DL(N);
20891   SDValue Op0 = N->getOperand(0);
20892   SDValue Op1 = N->getOperand(1);
20893   EVT ResVT = N->getValueType(0);
20894
20895   // uzp1(x, undef) -> concat(truncate(x), undef)
20896   if (Op1.getOpcode() == ISD::UNDEF) {
20897     EVT BCVT = MVT::Other, HalfVT = MVT::Other;
20898     switch (ResVT.getSimpleVT().SimpleTy) {
20899     default:
20900       break;
20901     case MVT::v16i8:
20902       BCVT = MVT::v8i16;
20903       HalfVT = MVT::v8i8;
20904       break;
20905     case MVT::v8i16:
20906       BCVT = MVT::v4i32;
20907       HalfVT = MVT::v4i16;
20908       break;
20909     case MVT::v4i32:
20910       BCVT = MVT::v2i64;
20911       HalfVT = MVT::v2i32;
20912       break;
20913     }
20914     if (BCVT != MVT::Other) {
20915       SDValue BC = DAG.getBitcast(BCVT, Op0);
20916       SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, BC);
20917       return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Trunc,
20918                          DAG.getUNDEF(HalfVT));
20919     }
20920   }
20921
// uzp1 discards the top half of each element, which is exactly the
// precondition for the RSHRNB rewrite of either operand.
20922   if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Op0, DAG, Subtarget))
20923     return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Rshrnb, Op1);
20924
20925   if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Op1, DAG, Subtarget))
20926     return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Rshrnb);
20927
20928   // uzp1(unpklo(uzp1(x, y)), z) => uzp1(x, z)
20929   if (Op0.getOpcode() == AArch64ISD::UUNPKLO) {
20930     if (Op0.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
20931       SDValue X = Op0.getOperand(0).getOperand(0);
20932       return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, X, Op1);
20933     }
20934   }
20935
20936   // uzp1(x, unpkhi(uzp1(y, z))) => uzp1(x, z)
20937   if (Op1.getOpcode() == AArch64ISD::UUNPKHI) {
20938     if (Op1.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
20939       SDValue Z = Op1.getOperand(0).getOperand(1);
20940       return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Z);
20941     }
20942   }
20943
20944   // uzp1(xtn x, xtn y) -> xtn(uzp1 (x, y))
20945   // Only implemented on little-endian subtargets.
20946   bool IsLittleEndian = DAG.getDataLayout().isLittleEndian();
20947
20948   // This optimization only works on little endian.
20949   if (!IsLittleEndian)
20950     return SDValue();
20951
20952   if (ResVT != MVT::v2i32 && ResVT != MVT::v4i16 && ResVT != MVT::v8i8)
20953     return SDValue();
20954
// Peel a truncate (possibly behind a bitcast) off an operand, returning the
// pre-truncation value, or a null SDValue if there is none.
20955   auto getSourceOp = [](SDValue Operand) -> SDValue {
20956     const unsigned Opcode = Operand.getOpcode();
20957     if (Opcode == ISD::TRUNCATE)
20958       return Operand->getOperand(0);
20959     if (Opcode == ISD::BITCAST &&
20960         Operand->getOperand(0).getOpcode() == ISD::TRUNCATE)
20961       return Operand->getOperand(0)->getOperand(0);
20962     return SDValue();
20963   };
20964
20965   SDValue SourceOp0 = getSourceOp(Op0);
20966   SDValue SourceOp1 = getSourceOp(Op1);
20967
20968   if (!SourceOp0 || !SourceOp1)
20969     return SDValue();
20970
20971   if (SourceOp0.getValueType() != SourceOp1.getValueType() ||
20972       !SourceOp0.getValueType().isSimple())
20973     return SDValue();
20974
20975   EVT ResultTy;
20976
20977   switch (SourceOp0.getSimpleValueType().SimpleTy) {
20978   case MVT::v2i64:
20979     ResultTy = MVT::v4i32;
20980     break;
20981   case MVT::v4i32:
20982     ResultTy = MVT::v8i16;
20983     break;
20984   case MVT::v8i16:
20985     ResultTy = MVT::v16i8;
20986     break;
20987   default:
20988     return SDValue();
20989   }
20990
20991   SDValue UzpOp0 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp0);
20992   SDValue UzpOp1 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp1);
20993   SDValue UzpResult =
20994       DAG.getNode(AArch64ISD::UZP1, DL, UzpOp0.getValueType(), UzpOp0, UzpOp1);
20995
20996   EVT BitcastResultTy;
20997
20998   switch (ResVT.getSimpleVT().SimpleTy) {
20999   case MVT::v2i32:
21000     BitcastResultTy = MVT::v2i64;
21001     break;
21002   case MVT::v4i16:
21003     BitcastResultTy = MVT::v4i32;
21004     break;
21005   case MVT::v8i8:
21006     BitcastResultTy = MVT::v8i16;
21007     break;
21008   default:
21009     llvm_unreachable("Should be one of {v2i32, v4i16, v8i8}");
21010   }
21011
21012   return DAG.getNode(ISD::TRUNCATE, DL, ResVT,
21013                      DAG.getNode(ISD::BITCAST, DL, BitcastResultTy, UzpResult));
21014 }
21015
// Fold sign/zero extensions of the vector offset operand into SVE gather-
// load (GLD1*) nodes, when the extension is from i32 and is predicated by
// the same predicate as the load itself.
// NOTE(review): the signature line (original 21016) and several elided
// continuation lines are missing from this excerpt: the upper bounds of the
// opcode-range assert (21020, 21022), parts of the Scaled/Signed/Extended
// opcode comparisons (21026, 21028, 21030-21032), and the opcode checks that
// define OffsetIsZExt/OffsetIsSExt (21045, 21047) — confirm against the full
// source.
21017   unsigned Opc = N->getOpcode();
21018
21019   assert(((Opc >= AArch64ISD::GLD1_MERGE_ZERO && // unsigned gather loads
21021           (Opc >= AArch64ISD::GLD1S_MERGE_ZERO && // signed gather loads
21023          "Invalid opcode.");
21024
21025   const bool Scaled = Opc == AArch64ISD::GLD1_SCALED_MERGE_ZERO ||
21027   const bool Signed = Opc == AArch64ISD::GLD1S_MERGE_ZERO ||
21029   const bool Extended = Opc == AArch64ISD::GLD1_SXTW_MERGE_ZERO ||
21033
21034   SDLoc DL(N);
21035   SDValue Chain = N->getOperand(0);
21036   SDValue Pg = N->getOperand(1);
21037   SDValue Base = N->getOperand(2);
21038   SDValue Offset = N->getOperand(3);
21039   SDValue Ty = N->getOperand(4);
21040
21041   EVT ResVT = N->getValueType(0);
21042
21043   const auto OffsetOpc = Offset.getOpcode();
21044   const bool OffsetIsZExt =
21046   const bool OffsetIsSExt =
21048
21049   // Fold sign/zero extensions of vector offsets into GLD1 nodes where possible.
21050   if (!Extended && (OffsetIsSExt || OffsetIsZExt)) {
21051     SDValue ExtPg = Offset.getOperand(0);
21052     VTSDNode *ExtFrom = cast<VTSDNode>(Offset.getOperand(2).getNode());
21053     EVT ExtFromEVT = ExtFrom->getVT().getVectorElementType();
21054
21055     // If the predicate for the sign- or zero-extended offset is the
21056     // same as the predicate used for this load and the sign-/zero-extension
21057     // was from a 32-bits...
21058     if (ExtPg == Pg && ExtFromEVT == MVT::i32) {
21059       SDValue UnextendedOffset = Offset.getOperand(1);
21060
// Switch to the [SU]XTW gather variant that extends the offset in-flight.
21061       unsigned NewOpc = getGatherVecOpcode(Scaled, OffsetIsSExt, true);
21062       if (Signed)
21063         NewOpc = getSignExtendedGatherOpcode(NewOpc);
21064
21065       return DAG.getNode(NewOpc, DL, {ResVT, MVT::Other},
21066                          {Chain, Pg, Base, UnextendedOffset, Ty});
21067     }
21068   }
21069
21070   return SDValue();
21071 }
21072
21073 /// Optimize a vector shift instruction and its operand if shifted out
21074 /// bits are not used.
// NOTE(review): the signature lines (original 21075 and 21077, carrying the
// function name and the DCI parameter) are partially elided from this
// excerpt — confirm against the full source.
21076                                          const AArch64TargetLowering &TLI,
21078   assert(N->getOpcode() == AArch64ISD::VASHR ||
21079          N->getOpcode() == AArch64ISD::VLSHR);
21080
21081   SDValue Op = N->getOperand(0);
21082   unsigned OpScalarSize = Op.getScalarValueSizeInBits();
21083
21084   unsigned ShiftImm = N->getConstantOperandVal(1);
21085   assert(OpScalarSize > ShiftImm && "Invalid shift imm");
21086
21087   // Remove sign_extend_inreg (ashr(shl(x)) based on the number of sign bits.
21088   if (N->getOpcode() == AArch64ISD::VASHR &&
21089       Op.getOpcode() == AArch64ISD::VSHL &&
21090       N->getOperand(1) == Op.getOperand(1))
21091     if (DCI.DAG.ComputeNumSignBits(Op.getOperand(0)) > ShiftImm)
21092       return Op.getOperand(0);
21093
// The low ShiftImm bits of the operand are shifted out by a right shift,
// so only the remaining bits are demanded from it.
21094   APInt ShiftedOutBits = APInt::getLowBitsSet(OpScalarSize, ShiftImm);
21095   APInt DemandedMask = ~ShiftedOutBits;
21096
21097   if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI))
21098     return SDValue(N, 0);
21099
21100   return SDValue();
21101 }
21102
// Combine for AArch64ISD::SUNPKLO.
// NOTE(review): the signature line (original 21103) is elided from this
// excerpt — confirm against the full source.
21104   // sunpklo(sext(pred)) -> sext(extract_low_half(pred))
21105   // This transform works in partnership with performSetCCPunpkCombine to
21106   // remove unnecessary transfer of predicates into standard registers and back
21107   if (N->getOperand(0).getOpcode() == ISD::SIGN_EXTEND &&
21108       N->getOperand(0)->getOperand(0)->getValueType(0).getScalarType() ==
21109           MVT::i1) {
21110     SDValue CC = N->getOperand(0)->getOperand(0);
// Extract the low half of the predicate vector and sign-extend that
// directly to the unpack's result type.
21111     auto VT = CC->getValueType(0).getHalfNumVectorElementsVT(*DAG.getContext());
21112     SDValue Unpk = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT, CC,
21113                                DAG.getVectorIdxConstant(0, SDLoc(N)));
21114     return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), N->getValueType(0), Unpk);
21115   }
21116
21117   return SDValue();
21118 }
21119
21120 /// Target-specific DAG combine function for post-increment LD1 (lane) and
21121 /// post-increment LD1R.
// NOTE(review): the signature lines (original 21122-21123) are elided; so
// are the declarations of Visited/Worklist (21196-21197) and of the Ops
// vector (21206) — presumably SmallPtrSet/SmallVector locals. Confirm
// against the full source.
21124                                            bool IsLaneOp) {
21125   if (DCI.isBeforeLegalizeOps())
21126     return SDValue();
21127
21128   SelectionDAG &DAG = DCI.DAG;
21129   EVT VT = N->getValueType(0);
21130
21131   if (!VT.is128BitVector() && !VT.is64BitVector())
21132     return SDValue();
21133
// For the lane op, operand 0 is the vector being inserted into and operand
// 1 is the load; for the dup op the load is operand 0.
21134   unsigned LoadIdx = IsLaneOp ? 1 : 0;
21135   SDNode *LD = N->getOperand(LoadIdx).getNode();
21136   // If it is not LOAD, can not do such combine.
21137   if (LD->getOpcode() != ISD::LOAD)
21138     return SDValue();
21139
21140   // The vector lane must be a constant in the LD1LANE opcode.
21141   SDValue Lane;
21142   if (IsLaneOp) {
21143     Lane = N->getOperand(2);
21144     auto *LaneC = dyn_cast<ConstantSDNode>(Lane);
21145     if (!LaneC || LaneC->getZExtValue() >= VT.getVectorNumElements())
21146       return SDValue();
21147   }
21148
21149   LoadSDNode *LoadSDN = cast<LoadSDNode>(LD);
21150   EVT MemVT = LoadSDN->getMemoryVT();
21151   // Check if memory operand is the same type as the vector element.
21152   if (MemVT != VT.getVectorElementType())
21153     return SDValue();
21154
21155   // Check if there are other uses. If so, do not combine as it will introduce
21156   // an extra load.
21157   for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE;
21158        ++UI) {
21159     if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result.
21160       continue;
21161     if (*UI != N)
21162       return SDValue();
21163   }
21164
21165   // If there is one use and it can splat the value, prefer that operation.
21166   // TODO: This could be expanded to more operations if they reliably use the
21167   // index variants.
21168   if (N->hasOneUse()) {
21169     unsigned UseOpc = N->use_begin()->getOpcode();
21170     if (UseOpc == ISD::FMUL || UseOpc == ISD::FMA)
21171       return SDValue();
21172   }
21173
21174   SDValue Addr = LD->getOperand(1);
21175   SDValue Vector = N->getOperand(0);
21176   // Search for a use of the address operand that is an increment.
21177   for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE =
21178        Addr.getNode()->use_end(); UI != UE; ++UI) {
21179     SDNode *User = *UI;
21180     if (User->getOpcode() != ISD::ADD
21181         || UI.getUse().getResNo() != Addr.getResNo())
21182       continue;
21183
21184     // If the increment is a constant, it must match the memory ref size.
21185     SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
21186     if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
21187       uint32_t IncVal = CInc->getZExtValue();
21188       unsigned NumBytes = VT.getScalarSizeInBits() / 8;
21189       if (IncVal != NumBytes)
21190         continue;
// XZR as the increment operand encodes the immediate post-increment form.
21191       Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
21192     }
21193
21194     // To avoid cycle construction make sure that neither the load nor the add
21195     // are predecessors to each other or the Vector.
21198     Visited.insert(Addr.getNode());
21199     Worklist.push_back(User);
21200     Worklist.push_back(LD);
21201     Worklist.push_back(Vector.getNode());
21202     if (SDNode::hasPredecessorHelper(LD, Visited, Worklist) ||
21203         SDNode::hasPredecessorHelper(User, Visited, Worklist))
21204       continue;
21205
21207     Ops.push_back(LD->getOperand(0));  // Chain
21208     if (IsLaneOp) {
21209       Ops.push_back(Vector);           // The vector to be inserted
21210       Ops.push_back(Lane);             // The lane to be inserted in the vector
21211     }
21212     Ops.push_back(Addr);
21213     Ops.push_back(Inc);
21214
21215     EVT Tys[3] = { VT, MVT::i64, MVT::Other };
21216     SDVTList SDTys = DAG.getVTList(Tys);
21217     unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
21218     SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
21219                                            MemVT,
21220                                            LoadSDN->getMemOperand());
21221
21222     // Update the uses.
21223     SDValue NewResults[] = {
21224       SDValue(LD, 0),            // The result of load
21225       SDValue(UpdN.getNode(), 2) // Chain
21226     };
21227     DCI.CombineTo(LD, NewResults);
21228     DCI.CombineTo(N, SDValue(UpdN.getNode(), 0));     // Dup/Inserted Result
21229     DCI.CombineTo(User, SDValue(UpdN.getNode(), 1));  // Write back register
21230
// Only one post-increment form can be created per node; stop searching.
21231     break;
21232   }
21233   return SDValue();
21234 }
21235
21236/// Simplify ``Addr`` given that the top byte of it is ignored by HW during
21237/// address translation.
///
/// Demands only the low 56 bits of the 64-bit address; if SimplifyDemandedBits
/// can exploit that freedom, the simplification is committed through DCI and
/// this returns true, otherwise false.
/// NOTE(review): the opening of the signature and the TargetLoweringOpt (TLO)
/// declaration are elided in this rendering — confirm against the full source.
21240 SelectionDAG &DAG) {
// Under TBI (top-byte-ignore) the hardware ignores bits [63:56] of the
// address, so only bits [55:0] are demanded here.
21241 APInt DemandedMask = APInt::getLowBitsSet(64, 56);
21242 KnownBits Known;
21244 !DCI.isBeforeLegalizeOps());
21245 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21246 if (TLI.SimplifyDemandedBits(Addr, DemandedMask, Known, TLO)) {
// A simplification was found: commit the replacement nodes into the DAG.
21247 DCI.CommitTargetLoweringOpt(TLO);
21248 return true;
21249 }
21250 return false;
21251}
21252
// Fold a truncating, unindexed store of a zero/sign/any-extended value into a
// plain store of the pre-extension value, when the store's memory type equals
// the original (pre-extension) type — the extend + truncation pair cancels.
// Returns the replacement store, or an empty SDValue if no fold applies.
// NOTE(review): the function signature (foldTruncStoreOfExt) is elided in this
// rendering.
21254 assert((N->getOpcode() == ISD::STORE || N->getOpcode() == ISD::MSTORE) &&
21255 "Expected STORE dag node in input!");
21256
// Only plain (non-masked) stores are handled below; MSTORE falls through to
// the trailing return.
21257 if (auto Store = dyn_cast<StoreSDNode>(N)) {
21258 if (!Store->isTruncatingStore() || Store->isIndexed())
21259 return SDValue();
21260 SDValue Ext = Store->getValue();
21261 auto ExtOpCode = Ext.getOpcode();
21262 if (ExtOpCode != ISD::ZERO_EXTEND && ExtOpCode != ISD::SIGN_EXTEND &&
21263 ExtOpCode != ISD::ANY_EXTEND)
21264 return SDValue();
21265 SDValue Orig = Ext->getOperand(0);
// The store must write exactly the pre-extension type for the fold to be
// a no-op on the stored bits.
21266 if (Store->getMemoryVT() != Orig.getValueType())
21267 return SDValue();
21268 return DAG.getStore(Store->getChain(), SDLoc(Store), Orig,
21269 Store->getBasePtr(), Store->getMemOperand());
21270 }
21271
21272 return SDValue();
21273}
21274
21275// Perform TBI simplification if supported by the target and try to break up
21276// nontemporal loads larger than 256 bits for odd types so LDNPQ 256-bit
21277// load instructions can be selected.
// NOTE(review): the function signature (performLOADCombine) and a few
// declaration lines (the LoadOps vector, and the opening lines of the
// MVT::getVectorVT / EVT::getVectorVT calls) are elided in this rendering —
// confirm against the full source.
21280 SelectionDAG &DAG,
21281 const AArch64Subtarget *Subtarget) {
21282 if (Subtarget->supportsAddressTopByteIgnored())
21283 performTBISimplification(N->getOperand(1), DCI, DAG);
21284
21285 LoadSDNode *LD = cast<LoadSDNode>(N);
21286 EVT MemVT = LD->getMemoryVT();
// Only non-volatile, nontemporal loads on little-endian targets are split.
21287 if (LD->isVolatile() || !LD->isNonTemporal() || !Subtarget->isLittleEndian())
21288 return SDValue(N, 0);
21289
// Skip scalable vectors, loads that already fit in <= 256 bits or are an
// exact multiple of 256 bits, and element widths that do not divide 256.
21290 if (MemVT.isScalableVector() || MemVT.getSizeInBits() <= 256 ||
21291 MemVT.getSizeInBits() % 256 == 0 ||
21292 256 % MemVT.getScalarSizeInBits() != 0)
21293 return SDValue(N, 0);
21294
21295 SDLoc DL(LD);
21296 SDValue Chain = LD->getChain();
21297 SDValue BasePtr = LD->getBasePtr();
21298 SDNodeFlags Flags = LD->getFlags();
21300 SmallVector<SDValue, 4> LoadOpsChain;
21301 // Replace any non temporal load over 256-bit with a series of 256 bit loads
21302 // and a scalar/vector load less than 256. This way we can utilize 256-bit
21303 // loads and reduce the amount of load instructions generated.
21304 MVT NewVT =
21306 256 / MemVT.getVectorElementType().getSizeInBits());
21307 unsigned Num256Loads = MemVT.getSizeInBits() / 256;
21308 // Create all 256-bit loads starting from offset 0 and up to Num256Loads-1*32.
21309 for (unsigned I = 0; I < Num256Loads; I++) {
// Each full chunk is 32 bytes (256 bits) further along the base pointer.
21310 unsigned PtrOffset = I * 32;
21311 SDValue NewPtr = DAG.getMemBasePlusOffset(
21312 BasePtr, TypeSize::getFixed(PtrOffset), DL, Flags);
21313 Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
21314 SDValue NewLoad = DAG.getLoad(
21315 NewVT, DL, Chain, NewPtr, LD->getPointerInfo().getWithOffset(PtrOffset),
21316 NewAlign, LD->getMemOperand()->getFlags(), LD->getAAInfo());
21317 LoadOps.push_back(NewLoad);
// Result 1 of each load is its chain output; collected for the TokenFactor.
21318 LoadOpsChain.push_back(SDValue(cast<SDNode>(NewLoad), 1));
21319 }
21320
21321 // Process remaining bits of the load operation.
21322 // This is done by creating an UNDEF vector to match the size of the
21323 // 256-bit loads and inserting the remaining load to it. We extract the
21324 // original load type at the end using EXTRACT_SUBVECTOR instruction.
21325 unsigned BitsRemaining = MemVT.getSizeInBits() % 256;
21326 unsigned PtrOffset = (MemVT.getSizeInBits() - BitsRemaining) / 8;
21327 MVT RemainingVT = MVT::getVectorVT(
21329 BitsRemaining / MemVT.getVectorElementType().getSizeInBits());
21330 SDValue NewPtr = DAG.getMemBasePlusOffset(
21331 BasePtr, TypeSize::getFixed(PtrOffset), DL, Flags);
21332 Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
21333 SDValue RemainingLoad =
21334 DAG.getLoad(RemainingVT, DL, Chain, NewPtr,
21335 LD->getPointerInfo().getWithOffset(PtrOffset), NewAlign,
21336 LD->getMemOperand()->getFlags(), LD->getAAInfo());
21337 SDValue UndefVector = DAG.getUNDEF(NewVT);
21338 SDValue InsertIdx = DAG.getVectorIdxConstant(0, DL);
// Widen the tail load to the 256-bit chunk type so it can be concatenated
// with the full chunks.
21339 SDValue ExtendedReminingLoad =
21340 DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewVT,
21341 {UndefVector, RemainingLoad, InsertIdx});
21342 LoadOps.push_back(ExtendedReminingLoad);
21343 LoadOpsChain.push_back(SDValue(cast<SDNode>(RemainingLoad), 1));
21344 EVT ConcatVT =
21346 LoadOps.size() * NewVT.getVectorNumElements());
21347 SDValue ConcatVectors =
21348 DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, LoadOps);
21349 // Extract the original vector type size.
21350 SDValue ExtractSubVector =
21351 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT,
21352 {ConcatVectors, DAG.getVectorIdxConstant(0, DL)});
// Merge all the partial-load chains so memory ordering is preserved.
21353 SDValue TokenFactor =
21354 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, LoadOpsChain);
21355 return DAG.getMergeValues({ExtractSubVector, TokenFactor}, DL);
21356}
21357
// Walk upwards from an all-i1 vector value ``Op`` trying to recover the
// original (pre-compare / pre-truncate) vector type, so callers can operate
// on full-width elements instead of i1. Recursion is capped at Depth > 3.
// NOTE(review): the signature, the depth-limit early-return body (21364), the
// BaseVT declaration (21371) and the mismatch early return (21380) are elided
// in this rendering — confirm against the full source.
21359 EVT VecVT = Op.getValueType();
21360 assert(VecVT.isVector() && VecVT.getVectorElementType() == MVT::i1 &&
21361 "Need boolean vector type.");
21362
21363 if (Depth > 3)
21365
21366 // We can get the base type from a vector compare or truncate.
21367 if (Op.getOpcode() == ISD::SETCC || Op.getOpcode() == ISD::TRUNCATE)
21368 return Op.getOperand(0).getValueType();
21369
21370 // If an operand is a bool vector, continue looking.
21372 for (SDValue Operand : Op->op_values()) {
21373 if (Operand.getValueType() != VecVT)
21374 continue;
21375
21376 EVT OperandVT = tryGetOriginalBoolVectorType(Operand, Depth + 1);
// The first discovered type becomes the candidate; every other i1-vector
// operand must agree with it.
21377 if (!BaseVT.isSimple())
21378 BaseVT = OperandVT;
21379 else if (OperandVT != BaseVT)
21381 }
21382
21383 return BaseVT;
21384}
21385
21386// When converting a <N x iX> vector to <N x i1> to store or use as a scalar
21387// iN, we can use a trick that extracts the i^th bit from the i^th element and
21388// then performs a vector add to get a scalar bitmask. This requires that each
21389// element's bits are either all 1 or all 0.
// NOTE(review): the function signature (vectorToScalarBitmask) is elided in
// this rendering — confirm parameter list against the full source.
21391 SDLoc DL(N);
21392 SDValue ComparisonResult(N, 0);
21393 EVT VecVT = ComparisonResult.getValueType();
21394 assert(VecVT.isVector() && "Must be a vector type");
21395
// Only power-of-two lane counts from 2 to 16 map onto this bit trick.
21396 unsigned NumElts = VecVT.getVectorNumElements();
21397 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
21398 return SDValue();
21399
21400 if (VecVT.getVectorElementType() != MVT::i1 &&
21401 !DAG.getTargetLoweringInfo().isTypeLegal(VecVT))
21402 return SDValue();
21403
21404 // If we can find the original types to work on instead of a vector of i1,
21405 // we can avoid extend/extract conversion instructions.
21406 if (VecVT.getVectorElementType() == MVT::i1) {
21407 VecVT = tryGetOriginalBoolVectorType(ComparisonResult);
21408 if (!VecVT.isSimple()) {
// No original type recovered: pick element widths that give at least a
// 64-bit vector, with a floor of 8 bits per element.
21409 unsigned BitsPerElement = std::max(64 / NumElts, 8u); // >= 64-bit vector
21410 VecVT = MVT::getVectorVT(MVT::getIntegerVT(BitsPerElement), NumElts);
21411 }
21412 }
21413 VecVT = VecVT.changeVectorElementTypeToInteger();
21414
21415 // Large vectors don't map directly to this conversion, so to avoid too many
21416 // edge cases, we don't apply it here. The conversion will likely still be
21417 // applied later via multiple smaller vectors, whose results are concatenated.
21418 if (VecVT.getSizeInBits() > 128)
21419 return SDValue();
21420
21421 // Ensure that all elements' bits are either 0s or 1s.
21422 ComparisonResult = DAG.getSExtOrTrunc(ComparisonResult, DL, VecVT);
21423
21424 SmallVector<SDValue, 16> MaskConstants;
21425 if (VecVT == MVT::v16i8) {
21426 // v16i8 is a special case, as we have 16 entries but only 8 positional bits
21427 // per entry. We split it into two halves, apply the mask, zip the halves to
21428 // create 8x 16-bit values, and then perform the vector reduce.
21429 for (unsigned Half = 0; Half < 2; ++Half) {
21430 for (unsigned MaskBit = 1; MaskBit <= 128; MaskBit *= 2) {
21431 MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i32));
21432 }
21433 }
21434 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, MaskConstants);
21435 SDValue RepresentativeBits =
21436 DAG.getNode(ISD::AND, DL, VecVT, ComparisonResult, Mask);
21437
// Rotate the vector by 8 bytes so each low-half byte is paired with the
// corresponding high-half byte by the following ZIP1.
21438 SDValue UpperRepresentativeBits =
21439 DAG.getNode(AArch64ISD::EXT, DL, VecVT, RepresentativeBits,
21440 RepresentativeBits, DAG.getConstant(8, DL, MVT::i32));
21441 SDValue Zipped = DAG.getNode(AArch64ISD::ZIP1, DL, VecVT,
21442 RepresentativeBits, UpperRepresentativeBits);
21443 Zipped = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Zipped);
21444 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i16, Zipped);
21445 }
21446
21447 // All other vector sizes.
21448 unsigned MaxBitMask = 1u << (VecVT.getVectorNumElements() - 1);
21449 for (unsigned MaskBit = 1; MaskBit <= MaxBitMask; MaskBit *= 2) {
21450 MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i64));
21451 }
21452
// Keep bit i of lane i, then horizontally add: the sum is the bitmask.
21453 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, MaskConstants);
21454 SDValue RepresentativeBits =
21455 DAG.getNode(ISD::AND, DL, VecVT, ComparisonResult, Mask);
21456 EVT ResultVT = MVT::getIntegerVT(std::max<unsigned>(
21457 NumElts, VecVT.getVectorElementType().getSizeInBits()));
21458 return DAG.getNode(ISD::VECREDUCE_ADD, DL, ResultVT, RepresentativeBits);
21459}
21460
// Combine a truncating store of a bool (<N x i1>) vector into a store of a
// scalar bitmask: the i1 vector is converted to one bit per lane via
// vectorToScalarBitmask and stored as an integer of the memory width.
// NOTE(review): the function signature (combineBoolVectorAndTruncateStore)
// and the first line of the StoreVT initialisation (21486) are elided in this
// rendering — confirm against the full source.
21462 StoreSDNode *Store) {
21463 if (!Store->isTruncatingStore())
21464 return SDValue();
21465
21466 SDLoc DL(Store);
21467 SDValue VecOp = Store->getValue();
21468 EVT VT = VecOp.getValueType();
21469 EVT MemVT = Store->getMemoryVT();
21470
// Only vector-to-i1-vector truncating stores qualify.
21471 if (!MemVT.isVector() || !VT.isVector() ||
21472 MemVT.getVectorElementType() != MVT::i1)
21473 return SDValue();
21474
21475 // If we are storing a vector that we are currently building, let
21476 // `scalarizeVectorStore()` handle this more efficiently.
21477 if (VecOp.getOpcode() == ISD::BUILD_VECTOR)
21478 return SDValue();
21479
21480 VecOp = DAG.getNode(ISD::TRUNCATE, DL, MemVT, VecOp);
21481 SDValue VectorBits = vectorToScalarBitmask(VecOp.getNode(), DAG);
21482 if (!VectorBits)
21483 return SDValue();
21484
21485 EVT StoreVT =
// Resize the computed bitmask to the scalar width being stored.
21487 SDValue ExtendedBits = DAG.getZExtOrTrunc(VectorBits, DL, StoreVT);
21488 return DAG.getStore(Store->getChain(), DL, ExtendedBits, Store->getBasePtr(),
21489 Store->getMemOperand());
21490}
21491
// Returns true when truncating SrcVT to DstVT exactly halves the element
// width of one of the packed legal SVE integer vector types (nxv8i16,
// nxv4i32, nxv2i64) — the shapes the truncating-store combines in this file
// feed into trySimplifySrlAddToRshrnb.
// NOTE(review): the function signature (isHalvingTruncateOfLegalScalableType)
// is elided in this rendering.
21493 return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv8i8) ||
21494 (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv4i16) ||
21495 (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv2i32);
21496}
21497
// DAG combine for ISD::STORE. In order, it tries to:
//  * fold FP_ROUND + store into a truncating store (SVE fixed-length path),
//  * split wide stores (splitStores),
//  * simplify the address under top-byte-ignore (TBI),
//  * fold a truncating store of an extend (foldTruncStoreOfExt),
//  * turn a bool-vector truncating store into a bitmask store,
//  * rewrite srl+add feeding a halving truncstore (trySimplifySrlAddToRshrnb).
// NOTE(review): the function signature (performSTORECombine) is elided in
// this rendering — confirm parameter list against the full source.
21500 SelectionDAG &DAG,
21501 const AArch64Subtarget *Subtarget) {
21502 StoreSDNode *ST = cast<StoreSDNode>(N);
21503 SDValue Chain = ST->getChain();
21504 SDValue Value = ST->getValue();
21505 SDValue Ptr = ST->getBasePtr();
21506 EVT ValueVT = Value.getValueType();
21507
// Restrict the FP_ROUND fold below to f32/f64 element types.
21508 auto hasValidElementTypeForFPTruncStore = [](EVT VT) {
21509 EVT EltVT = VT.getVectorElementType();
21510 return EltVT == MVT::f32 || EltVT == MVT::f64;
21511 };
21512
21513 // If this is an FP_ROUND followed by a store, fold this into a truncating
21514 // store. We can do this even if this is already a truncstore.
21515 // We purposefully don't care about legality of the nodes here as we know
21516 // they can be split down into something legal.
21517 if (DCI.isBeforeLegalizeOps() && Value.getOpcode() == ISD::FP_ROUND &&
21518 Value.getNode()->hasOneUse() && ST->isUnindexed() &&
21519 Subtarget->useSVEForFixedLengthVectors() &&
21520 ValueVT.isFixedLengthVector() &&
21521 ValueVT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits() &&
21522 hasValidElementTypeForFPTruncStore(Value.getOperand(0).getValueType()))
21523 return DAG.getTruncStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
21524 ST->getMemoryVT(), ST->getMemOperand());
21525
21526 if (SDValue Split = splitStores(N, DCI, DAG, Subtarget))
21527 return Split;
21528
// TBI simplification mutates the DAG in place; returning N signals "changed".
21529 if (Subtarget->supportsAddressTopByteIgnored() &&
21530 performTBISimplification(N->getOperand(2), DCI, DAG))
21531 return SDValue(N, 0);
21532
21533 if (SDValue Store = foldTruncStoreOfExt(DAG, N))
21534 return Store;
21535
21536 if (SDValue Store = combineBoolVectorAndTruncateStore(DAG, ST))
21537 return Store;
21538
21539 if (ST->isTruncatingStore()) {
21540 EVT StoreVT = ST->getMemoryVT();
21541 if (!isHalvingTruncateOfLegalScalableType(ValueVT, StoreVT))
21542 return SDValue();
21543 if (SDValue Rshrnb =
21544 trySimplifySrlAddToRshrnb(ST->getOperand(1), DAG, Subtarget)) {
21545 return DAG.getTruncStore(ST->getChain(), ST, Rshrnb, ST->getBasePtr(),
21546 StoreVT, ST->getMemOperand());
21547 }
21548 }
21549
21550 return SDValue();
21551}
21552
// DAG combine for masked stores (ISD::MSTORE): folds UZP1 + masked store into
// a masked truncating store when the PTRUE predicate pattern can safely be
// doubled, and applies the RSHRNB simplification to halving masked
// truncstores.
// NOTE(review): the function signature (performMSTORECombine) is elided in
// this rendering — confirm parameter list against the full source.
21555 SelectionDAG &DAG,
21556 const AArch64Subtarget *Subtarget) {
21557 MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
21558 SDValue Value = MST->getValue();
21559 SDValue Mask = MST->getMask();
21560 SDLoc DL(N);
21561
21562 // If this is a UZP1 followed by a masked store, fold this into a masked
21563 // truncating store. We can do this even if this is already a masked
21564 // truncstore.
21565 if (Value.getOpcode() == AArch64ISD::UZP1 && Value->hasOneUse() &&
21566 MST->isUnindexed() && Mask->getOpcode() == AArch64ISD::PTRUE &&
21567 Value.getValueType().isInteger()) {
21568 Value = Value.getOperand(0);
21569 if (Value.getOpcode() == ISD::BITCAST) {
21570 EVT HalfVT =
21571 Value.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
21572 EVT InVT = Value.getOperand(0).getValueType();
21573
// The bitcast must be a half-count / double-width reinterpretation for
// the UZP1 to be a pure truncation of the pre-bitcast value.
21574 if (HalfVT.widenIntegerVectorElementType(*DAG.getContext()) == InVT) {
21575 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
21576 unsigned PgPattern = Mask->getConstantOperandVal(0);
21577
21578 // Ensure we can double the size of the predicate pattern
21579 unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
21580 if (NumElts && NumElts * InVT.getVectorElementType().getSizeInBits() <=
21581 MinSVESize) {
// Re-issue the same pattern at the wider element granularity.
21582 Mask = getPTrue(DAG, DL, InVT.changeVectorElementType(MVT::i1),
21583 PgPattern);
21584 return DAG.getMaskedStore(MST->getChain(), DL, Value.getOperand(0),
21585 MST->getBasePtr(), MST->getOffset(), Mask,
21586 MST->getMemoryVT(), MST->getMemOperand(),
21587 MST->getAddressingMode(),
21588 /*IsTruncating=*/true);
21589 }
21590 }
21591 }
21592 }
21593
21594 if (MST->isTruncatingStore()) {
21595 EVT ValueVT = Value->getValueType(0);
21596 EVT MemVT = MST->getMemoryVT();
21597 if (!isHalvingTruncateOfLegalScalableType(ValueVT, MemVT))
21598 return SDValue();
21599 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Value, DAG, Subtarget)) {
21600 return DAG.getMaskedStore(MST->getChain(), DL, Rshrnb, MST->getBasePtr(),
21601 MST->getOffset(), MST->getMask(),
21602 MST->getMemoryVT(), MST->getMemOperand(),
21603 MST->getAddressingMode(), true);
21604 }
21605 }
21606
21607 return SDValue();
21608}
21609
21610/// \return true if part of the index was folded into the Base.
21611static bool foldIndexIntoBase(SDValue &BasePtr, SDValue &Index, SDValue Scale,
21612 SDLoc DL, SelectionDAG &DAG) {
21613 // This function assumes a vector of i64 indices.
21614 EVT IndexVT = Index.getValueType();
21615 if (!IndexVT.isVector() || IndexVT.getVectorElementType() != MVT::i64)
21616 return false;
21617
21618 // Simplify:
21619 // BasePtr = Ptr
21620 // Index = X + splat(Offset)
21621 // ->
21622 // BasePtr = Ptr + Offset * scale.
21623 // Index = X
21624 if (Index.getOpcode() == ISD::ADD) {
21625 if (auto Offset = DAG.getSplatValue(Index.getOperand(1))) {
21626 Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
21627 BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
21628 Index = Index.getOperand(0);
21629 return true;
21630 }
21631 }
21632
21633 // Simplify:
21634 // BasePtr = Ptr
21635 // Index = (X + splat(Offset)) << splat(Shift)
21636 // ->
21637 // BasePtr = Ptr + (Offset << Shift) * scale)
21638 // Index = X << splat(shift)
21639 if (Index.getOpcode() == ISD::SHL &&
21640 Index.getOperand(0).getOpcode() == ISD::ADD) {
21641 SDValue Add = Index.getOperand(0);
21642 SDValue ShiftOp = Index.getOperand(1);
21643 SDValue OffsetOp = Add.getOperand(1);
21644 if (auto Shift = DAG.getSplatValue(ShiftOp))
21645 if (auto Offset = DAG.getSplatValue(OffsetOp)) {
21646 Offset = DAG.getNode(ISD::SHL, DL, MVT::i64, Offset, Shift);
21647 Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
21648 BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
21649 Index = DAG.getNode(ISD::SHL, DL, Index.getValueType(),
21650 Add.getOperand(0), ShiftOp);
21651 return true;
21652 }
21653 }
21654
21655 return false;
21656}
21657
21658// Analyse the specified address returning true if a more optimal addressing
21659// mode is available. When returning true all parameters are updated to reflect
21660// their recommended values.
// NOTE(review): the function signature (findMoreOptimalIndexType) and the
// initializer of MaxVScale (21716) are elided in this rendering — confirm
// against the full source.
21662 SDValue &BasePtr, SDValue &Index,
21663 SelectionDAG &DAG) {
21664 // Try to iteratively fold parts of the index into the base pointer to
21665 // simplify the index as much as possible.
21666 bool Changed = false;
21667 while (foldIndexIntoBase(BasePtr, Index, N->getScale(), SDLoc(N), DAG))
21668 Changed = true;
21669
21670 // Only consider element types that are pointer sized as smaller types can
21671 // be easily promoted.
21672 EVT IndexVT = Index.getValueType();
21673 if (IndexVT.getVectorElementType() != MVT::i64 || IndexVT == MVT::nxv2i64)
21674 return Changed;
21675
21676 // Can indices be trivially shrunk?
21677 EVT DataVT = N->getOperand(1).getValueType();
21678 // Don't attempt to shrink the index for fixed vectors of 64 bit data since it
21679 // will later be re-extended to 64 bits in legalization
21680 if (DataVT.isFixedLengthVector() && DataVT.getScalarSizeInBits() == 64)
21681 return Changed;
// All index values provably fit in 32 bits: truncate the whole index vector.
21682 if (ISD::isVectorShrinkable(Index.getNode(), 32, N->isIndexSigned())) {
21683 EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
21684 Index = DAG.getNode(ISD::TRUNCATE, SDLoc(N), NewIndexVT, Index);
21685 return true;
21686 }
21687
21688 // Match:
21689 // Index = step(const)
21690 int64_t Stride = 0;
21691 if (Index.getOpcode() == ISD::STEP_VECTOR) {
21692 Stride = cast<ConstantSDNode>(Index.getOperand(0))->getSExtValue();
21693 }
21694 // Match:
21695 // Index = step(const) << shift(const)
21696 else if (Index.getOpcode() == ISD::SHL &&
21697 Index.getOperand(0).getOpcode() == ISD::STEP_VECTOR) {
21698 SDValue RHS = Index.getOperand(1);
21699 if (auto *Shift =
21700 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(RHS))) {
21701 int64_t Step = (int64_t)Index.getOperand(0).getConstantOperandVal(1);
21702 Stride = Step << Shift->getZExtValue();
21703 }
21704 }
21705
21706 // Return early because no supported pattern is found.
21707 if (Stride == 0)
21708 return Changed;
21709
// The stride itself must be representable as a 32-bit value.
21710 if (Stride < std::numeric_limits<int32_t>::min() ||
21711 Stride > std::numeric_limits<int32_t>::max())
21712 return Changed;
21713
21714 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
21715 unsigned MaxVScale =
// The offset of the last element, at the maximum vscale, must also fit in
// 32 bits, otherwise the narrowed step vector could wrap.
21717 int64_t LastElementOffset =
21718 IndexVT.getVectorMinNumElements() * Stride * MaxVScale;
21719
21720 if (LastElementOffset < std::numeric_limits<int32_t>::min() ||
21721 LastElementOffset > std::numeric_limits<int32_t>::max())
21722 return Changed;
21723
21724 EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
21725 // Stride does not scale explicitly by 'Scale', because it happens in
21726 // the gather/scatter addressing mode.
21727 Index = DAG.getStepVector(SDLoc(N), NewIndexVT, APInt(32, Stride));
21728 return true;
21729}
21730
// DAG combine for masked gather/scatter nodes: before legalisation, try to
// find a more optimal base/index pair (findMoreOptimalIndexType) and, when
// found, rebuild the gather or scatter with the improved operands.
// NOTE(review): the function signature (performMaskedGatherScatterCombine) is
// elided in this rendering — confirm parameter list against the full source.
21733 MaskedGatherScatterSDNode *MGS = cast<MaskedGatherScatterSDNode>(N);
21734 assert(MGS && "Can only combine gather load or scatter store nodes");
21735
21736 if (!DCI.isBeforeLegalize())
21737 return SDValue();
21738
21739 SDLoc DL(MGS);
21740 SDValue Chain = MGS->getChain();
21741 SDValue Scale = MGS->getScale();
21742 SDValue Index = MGS->getIndex();
21743 SDValue Mask = MGS->getMask();
21744 SDValue BasePtr = MGS->getBasePtr();
21745 ISD::MemIndexType IndexType = MGS->getIndexType();
21746
// BasePtr/Index are updated in place on success; bail if nothing improved.
21747 if (!findMoreOptimalIndexType(MGS, BasePtr, Index, DAG))
21748 return SDValue();
21749
21750 // Here we catch such cases early and change MGATHER's IndexType to allow
21751 // the use of an Index that's more legalisation friendly.
21752 if (auto *MGT = dyn_cast<MaskedGatherSDNode>(MGS)) {
21753 SDValue PassThru = MGT->getPassThru();
21754 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
21755 return DAG.getMaskedGather(
21756 DAG.getVTList(N->getValueType(0), MVT::Other), MGT->getMemoryVT(), DL,
21757 Ops, MGT->getMemOperand(), IndexType, MGT->getExtensionType());
21758 }
// Not a gather, so it must be a scatter: rebuild it the same way.
21759 auto *MSC = cast<MaskedScatterSDNode>(MGS);
21760 SDValue Data = MSC->getValue();
21761 SDValue Ops[] = {Chain, Data, Mask, BasePtr, Index, Scale};
21762 return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MSC->getMemoryVT(), DL,
21763 Ops, MSC->getMemOperand(), IndexType,
21764 MSC->isTruncatingStore());
21765}
21766
21767/// Target-specific DAG combine function for NEON load/store intrinsics
21768/// to merge base address updates.
///
/// Looks for an ADD that increments the intrinsic's address operand by the
/// memory access size and, if it is independent of the intrinsic, replaces
/// both with a single post-indexed (write-back) load/store node. Only the
/// first mergeable increment is folded (note the break at the loop end).
/// NOTE(review): the function signature, the Visited/Worklist declarations
/// (21788-21789) and the Ops declaration (21867) are elided in this rendering
/// — confirm against the full source.
21771 SelectionDAG &DAG) {
21772 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
21773 return SDValue();
21774
// By convention the address is the last operand of these intrinsic nodes.
21775 unsigned AddrOpIdx = N->getNumOperands() - 1;
21776 SDValue Addr = N->getOperand(AddrOpIdx);
21777
21778 // Search for a use of the address operand that is an increment.
21779 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
21780 UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
21781 SDNode *User = *UI;
21782 if (User->getOpcode() != ISD::ADD ||
21783 UI.getUse().getResNo() != Addr.getResNo())
21784 continue;
21785
21786 // Check that the add is independent of the load/store. Otherwise, folding
21787 // it would create a cycle.
21790 Visited.insert(Addr.getNode());
21791 Worklist.push_back(N);
21792 Worklist.push_back(User);
21793 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
21794 SDNode::hasPredecessorHelper(User, Visited, Worklist))
21795 continue;
21796
21797 // Find the new opcode for the updating load/store.
21798 bool IsStore = false;
21799 bool IsLaneOp = false;
21800 bool IsDupOp = false;
21801 unsigned NewOpc = 0;
21802 unsigned NumVecs = 0;
21803 unsigned IntNo = N->getConstantOperandVal(1);
21804 switch (IntNo) {
21805 default: llvm_unreachable("unexpected intrinsic for Neon base update");
21806 case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post;
21807 NumVecs = 2; break;
21808 case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post;
21809 NumVecs = 3; break;
21810 case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post;
21811 NumVecs = 4; break;
21812 case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post;
21813 NumVecs = 2; IsStore = true; break;
21814 case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post;
21815 NumVecs = 3; IsStore = true; break;
21816 case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post;
21817 NumVecs = 4; IsStore = true; break;
21818 case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post;
21819 NumVecs = 2; break;
21820 case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post;
21821 NumVecs = 3; break;
21822 case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post;
21823 NumVecs = 4; break;
21824 case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post;
21825 NumVecs = 2; IsStore = true; break;
21826 case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post;
21827 NumVecs = 3; IsStore = true; break;
21828 case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post;
21829 NumVecs = 4; IsStore = true; break;
21830 case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost;
21831 NumVecs = 2; IsDupOp = true; break;
21832 case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost;
21833 NumVecs = 3; IsDupOp = true; break;
21834 case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost;
21835 NumVecs = 4; IsDupOp = true; break;
21836 case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost;
21837 NumVecs = 2; IsLaneOp = true; break;
21838 case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost;
21839 NumVecs = 3; IsLaneOp = true; break;
21840 case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost;
21841 NumVecs = 4; IsLaneOp = true; break;
21842 case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost;
21843 NumVecs = 2; IsStore = true; IsLaneOp = true; break;
21844 case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost;
21845 NumVecs = 3; IsStore = true; IsLaneOp = true; break;
21846 case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost;
21847 NumVecs = 4; IsStore = true; IsLaneOp = true; break;
21848 }
21849
21850 EVT VecTy;
21851 if (IsStore)
21852 VecTy = N->getOperand(2).getValueType();
21853 else
21854 VecTy = N->getValueType(0);
21855
21856 // If the increment is a constant, it must match the memory ref size.
21857 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
21858 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
21859 uint32_t IncVal = CInc->getZExtValue();
21860 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
21861 if (IsLaneOp || IsDupOp)
21862 NumBytes /= VecTy.getVectorNumElements();
21863 if (IncVal != NumBytes)
21864 continue;
// Matching constant increments are encoded as XZR in the post-inc node.
21865 Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
21866 }
21868 Ops.push_back(N->getOperand(0)); // Incoming chain
21869 // Load lane and store have vector list as input.
21870 if (IsLaneOp || IsStore)
21871 for (unsigned i = 2; i < AddrOpIdx; ++i)
21872 Ops.push_back(N->getOperand(i));
21873 Ops.push_back(Addr); // Base register
21874 Ops.push_back(Inc);
21875
21876 // Return Types.
21877 EVT Tys[6];
21878 unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
21879 unsigned n;
21880 for (n = 0; n < NumResultVecs; ++n)
21881 Tys[n] = VecTy;
21882 Tys[n++] = MVT::i64; // Type of write back register
21883 Tys[n] = MVT::Other; // Type of the chain
21884 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
21885
21886 MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
21887 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
21888 MemInt->getMemoryVT(),
21889 MemInt->getMemOperand());
21890
21891 // Update the uses.
21892 std::vector<SDValue> NewResults;
21893 for (unsigned i = 0; i < NumResultVecs; ++i) {
21894 NewResults.push_back(SDValue(UpdN.getNode(), i));
21895 }
// Chain result comes after the vector results and the write-back register.
21896 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1));
21897 DCI.CombineTo(N, NewResults);
// Users of the ADD now read the post-incremented base from the new node.
21898 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
21899
21900 break;
21901 }
21902 return SDValue();
21903}
21904
21905// Checks to see if the value is the prescribed width and returns information
21906// about its extension mode.
21907static
21908bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
21909 ExtType = ISD::NON_EXTLOAD;
21910 switch(V.getNode()->getOpcode()) {
21911 default:
21912 return false;
21913 case ISD::LOAD: {
21914 LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode());
21915 if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8)
21916 || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
21917 ExtType = LoadNode->getExtensionType();
21918 return true;
21919 }
21920 return false;
21921 }
21922 case ISD::AssertSext: {
21923 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
21924 if ((TypeNode->getVT() == MVT::i8 && width == 8)
21925 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
21926 ExtType = ISD::SEXTLOAD;
21927 return true;
21928 }
21929 return false;
21930 }
21931 case ISD::AssertZext: {
21932 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
21933 if ((TypeNode->getVT() == MVT::i8 && width == 8)
21934 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
21935 ExtType = ISD::ZEXTLOAD;
21936 return true;
21937 }
21938 return false;
21939 }
21940 case ISD::Constant:
21941 case ISD::TargetConstant: {
21942 return std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) <
21943 1LL << (width - 1);
21944 }
21945 }
21946
21947 return true;
21948}
21949
21950// This function does a whole lot of voodoo to determine if the tests are
21951// equivalent without and with a mask. Essentially what happens is that given a
21952// DAG resembling:
21953//
21954// +-------------+ +-------------+ +-------------+ +-------------+
21955// | Input | | AddConstant | | CompConstant| | CC |
21956// +-------------+ +-------------+ +-------------+ +-------------+
21957// | | | |
21958// V V | +----------+
21959// +-------------+ +----+ | |
21960// | ADD | |0xff| | |
21961// +-------------+ +----+ | |
21962// | | | |
21963// V V | |
21964// +-------------+ | |
21965// | AND | | |
21966// +-------------+ | |
21967// | | |
21968// +-----+ | |
21969// | | |
21970// V V V
21971// +-------------+
21972// | CMP |
21973// +-------------+
21974//
21975// The AND node may be safely removed for some combinations of inputs. In
21976// particular we need to take into account the extension type of the Input,
21977// the exact values of AddConstant, CompConstant, and CC, along with the nominal
21978// width of the input (this can work for any width inputs, the above graph is
21979// specific to 8 bits).
21980//
21981// The specific equations were worked out by generating output tables for each
21982// AArch64CC value in terms of AddConstant (w1) and CompConstant (w2). The
21983// problem was simplified by working with 4 bit inputs, which means we only
21984// needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
21985// extension (8,15), 8 patterns unique to sign extensions (-8,-1), and 8
21986// patterns present in both extensions (0,7). For every distinct set of
21987// AddConstant and CompConstants bit patterns we can consider the masked and
21988// unmasked versions to be equivalent if the result of this function is true for
21989// all 16 distinct bit patterns of for the current extension type of Input (w0).
21990//
21991// sub w8, w0, w1
21992// and w10, w8, #0x0f
21993// cmp w8, w2
21994// cset w9, AArch64CC
21995// cmp w10, w2
21996// cset w11, AArch64CC
21997// cmp w9, w11
21998// cset w0, eq
21999// ret
22000//
22001// Since the above function shows when the outputs are equivalent it defines
22002// when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
22003// would be expensive to run during compiles. The equations below were written
22004// in a test harness that confirmed they gave equivalent outputs to the above
22005// for all inputs, so they can be used to determine if the removal is
22006// legal instead.
22007//
22008// isEquivalentMaskless() is the code for testing if the AND can be removed
22009// factored out of the DAG recognition as the DAG can take several forms.
22010
/// Returns true when, for the given condition code, extension kind and
/// constants, comparing the masked and the unmasked (width-bit) value against
/// CompConstant produces the same result, i.e. the AND mask may be removed.
/// See the derivation notes in the comment block above; the per-CC regions
/// below were validated against an exhaustive 4-bit test harness.
static bool isEquivalentMaskless(unsigned CC, unsigned width,
                                 ISD::LoadExtType ExtType, int AddConstant,
                                 int CompConstant) {
  // By being careful about our equations and only writing the in term
  // symbolic values and well known constants (0, 1, -1, MaxUInt) we can
  // make them generally applicable to all bit widths.
  // NB: despite the name, this is 2^width, i.e. one past the largest
  // unsigned width-bit value.
  int MaxUInt = (1 << width);

  // For the purposes of these comparisons sign extending the type is
  // equivalent to zero extending the add and displacing it by half the integer
  // width. Provided we are careful and make sure our equations are valid over
  // the whole range we can just adjust the input and avoid writing equations
  // for sign extended inputs.
  if (ExtType == ISD::SEXTLOAD)
    AddConstant -= (1 << (width-1));

  // Each case lists the (AddConstant, CompConstant) regions in which the
  // masked and unmasked comparisons agree for that condition code.
  switch(CC) {
  case AArch64CC::LE:
  case AArch64CC::GT:
    if ((AddConstant == 0) ||
        (CompConstant == MaxUInt - 1 && AddConstant < 0) ||
        (AddConstant >= 0 && CompConstant < 0) ||
        (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
      return true;
    break;
  case AArch64CC::LT:
  case AArch64CC::GE:
    if ((AddConstant == 0) ||
        (AddConstant >= 0 && CompConstant <= 0) ||
        (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
      return true;
    break;
  case AArch64CC::HI:
  case AArch64CC::LS:
    if ((AddConstant >= 0 && CompConstant < 0) ||
       (AddConstant <= 0 && CompConstant >= -1 &&
        CompConstant < AddConstant + MaxUInt))
      return true;
    break;
  case AArch64CC::PL:
  case AArch64CC::MI:
    if ((AddConstant == 0) ||
        (AddConstant > 0 && CompConstant <= 0) ||
        (AddConstant < 0 && CompConstant <= AddConstant))
      return true;
    break;
  case AArch64CC::LO:
  case AArch64CC::HS:
    if ((AddConstant >= 0 && CompConstant <= 0) ||
        (AddConstant <= 0 && CompConstant >= 0 &&
         CompConstant <= AddConstant + MaxUInt))
      return true;
    break;
  case AArch64CC::EQ:
  case AArch64CC::NE:
    if ((AddConstant > 0 && CompConstant < 0) ||
        (AddConstant < 0 && CompConstant >= 0 &&
         CompConstant < AddConstant + MaxUInt) ||
        (AddConstant >= 0 && CompConstant >= 0 &&
         CompConstant >= AddConstant) ||
        (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
      return true;
    break;
  // These codes unconditionally return true: the function reports equivalence
  // regardless of the constants.
  case AArch64CC::VS:
  case AArch64CC::VC:
  case AArch64CC::AL:
  case AArch64CC::NV:
    return true;
  case AArch64CC::Invalid:
    break;
  }

  return false;
}
22085
// (X & C) >u Mask --> ((X & (C & ~Mask)) != 0)
// (X & C) <u Pow2 --> ((X & (C & ~(Pow2 - 1))) == 0)
                                        SDNode *AndNode, SelectionDAG &DAG,
                                        unsigned CCIndex, unsigned CmpIndex,
                                        unsigned CC) {
  // The SUBS (compare) must be against a constant to reason about the mask.
  ConstantSDNode *SubsC = dyn_cast<ConstantSDNode>(SubsNode->getOperand(1));
  if (!SubsC)
    return SDValue();

  // HI requires the compared constant to be a contiguous low mask; LO requires
  // a power of two (so Pow2-1 forms the mask). Any other CC is rejected.
  APInt SubsAP = SubsC->getAPIntValue();
  if (CC == AArch64CC::HI) {
    if (!SubsAP.isMask())
      return SDValue();
  } else if (CC == AArch64CC::LO) {
    if (!SubsAP.isPowerOf2())
      return SDValue();
  } else
    return SDValue();

  // The AND being rewritten must also use a constant mask.
  ConstantSDNode *AndC = dyn_cast<ConstantSDNode>(AndNode->getOperand(1));
  if (!AndC)
    return SDValue();

  APInt MaskAP = CC == AArch64CC::HI ? SubsAP : (SubsAP - 1);

  SDLoc DL(N);
  // Build ANDS with the AND constant restricted to the bits above the mask;
  // the comparison then reduces to a test of those bits against zero.
  APInt AndSMask = (~MaskAP) & AndC->getAPIntValue();
  SDValue ANDS = DAG.getNode(
      AArch64ISD::ANDS, DL, SubsNode->getVTList(), AndNode->getOperand(0),
      DAG.getConstant(AndSMask, DL, SubsC->getValueType(0)));
  SDValue AArch64_CC =
                      N->getOperand(CCIndex)->getValueType(0));

  // For now, only performCSELCombine and performBRCONDCombine call this
  // function. And both of them pass 2 for CCIndex, 3 for CmpIndex with 4
  // operands. So just init the ops direct to simplify the code. If we have some
  // other case with different CCIndex, CmpIndex, we need to use for loop to
  // rewrite the code here.
  // TODO: Do we need to assert number of operand is 4 here?
  assert((CCIndex == 2 && CmpIndex == 3) &&
         "Expected CCIndex to be 2 and CmpIndex to be 3.");
  // Rebuild N with the new condition code and the ANDS flag result.
  SDValue Ops[] = {N->getOperand(0), N->getOperand(1), AArch64_CC,
                   ANDS.getValue(1)};
  return DAG.getNode(N->getOpcode(), N, N->getVTList(), Ops);
}
22133
// Try to prove that the AND masking the input of the SUBS (compare) feeding
// N's condition (operand CmpIndex, with the condition code at operand
// CCIndex) is redundant and remove it; first via performSubsToAndsCombine,
// then, for 8/16-bit masks of an ADD, via isEquivalentMaskless.
static
                           SelectionDAG &DAG, unsigned CCIndex,
                           unsigned CmpIndex) {
  unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue();
  SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
  unsigned CondOpcode = SubsNode->getOpcode();

  // Only a SUBS whose integer result is unused (flags-only) qualifies.
  if (CondOpcode != AArch64ISD::SUBS || SubsNode->hasAnyUseOfValue(0))
    return SDValue();

  // There is a SUBS feeding this condition. Is it fed by a mask we can
  // use?

  SDNode *AndNode = SubsNode->getOperand(0).getNode();
  unsigned MaskBits = 0;

  if (AndNode->getOpcode() != ISD::AND)
    return SDValue();

  if (SDValue Val = performSubsToAndsCombine(N, SubsNode, AndNode, DAG, CCIndex,
                                             CmpIndex, CC))
    return Val;

  // Only byte (0xFF) and halfword (0xFFFF) masks are handled below.
  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) {
    uint32_t CNV = CN->getZExtValue();
    if (CNV == 255)
      MaskBits = 8;
    else if (CNV == 65535)
      MaskBits = 16;
  }

  if (!MaskBits)
    return SDValue();

  SDValue AddValue = AndNode->getOperand(0);

  if (AddValue.getOpcode() != ISD::ADD)
    return SDValue();

  // The basic dag structure is correct, grab the inputs and validate them.

  SDValue AddInputValue1 = AddValue.getNode()->getOperand(0);
  SDValue AddInputValue2 = AddValue.getNode()->getOperand(1);
  SDValue SubsInputValue = SubsNode->getOperand(1);

  // The mask is present and the provenance of all the values is a smaller type,
  // lets see if the mask is superfluous.

  if (!isa<ConstantSDNode>(AddInputValue2.getNode()) ||
      !isa<ConstantSDNode>(SubsInputValue.getNode()))
    return SDValue();

  ISD::LoadExtType ExtType;

  // All three inputs must fit in MaskBits with a consistent extension kind.
  if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) ||
      !checkValueWidth(AddInputValue2, MaskBits, ExtType) ||
      !checkValueWidth(AddInputValue1, MaskBits, ExtType) )
    return SDValue();

  if(!isEquivalentMaskless(CC, MaskBits, ExtType,
      cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(),
      cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue()))
    return SDValue();

  // The AND is not necessary, remove it.

  SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
                               SubsNode->getValueType(1));
  SDValue Ops[] = { AddValue, SubsNode->getOperand(1) };

  // Rebuild the SUBS directly on the ADD and redirect all users of the old
  // SUBS (including N's condition input) to it.
  SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
  DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode());

  return SDValue(N, 0);
}
22211
// Optimize compare with zero and branch.
                                    SelectionDAG &DAG) {
  // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
  // will not be produced, as they are conditional branch instructions that do
  // not set flags.
  if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
    return SDValue();

  // First give performCONDCombine a chance to strip a redundant mask; it may
  // return a rebuilt node to continue with.
  if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3))
    N = NV.getNode();
  SDValue Chain = N->getOperand(0);
  SDValue Dest = N->getOperand(1);
  SDValue CCVal = N->getOperand(2);
  SDValue Cmp = N->getOperand(3);

  assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
  unsigned CC = CCVal->getAsZExtVal();
  // Only EQ/NE against zero can become CBZ/CBNZ.
  if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
    return SDValue();

  unsigned CmpOpc = Cmp.getOpcode();
  if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS)
    return SDValue();

  // Only attempt folding if there is only one use of the flag and no use of the
  // value.
  if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1))
    return SDValue();

  SDValue LHS = Cmp.getOperand(0);
  SDValue RHS = Cmp.getOperand(1);

  assert(LHS.getValueType() == RHS.getValueType() &&
         "Expected the value type to be the same for both operands!");
  if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
    return SDValue();

  // Canonicalize the zero onto the RHS.
  if (isNullConstant(LHS))
    std::swap(LHS, RHS);

  if (!isNullConstant(RHS))
    return SDValue();

  if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
      LHS.getOpcode() == ISD::SRL)
    return SDValue();

  // Fold the compare into the branch instruction.
  SDValue BR;
  if (CC == AArch64CC::EQ)
    BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
  else
    BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);

  // Do not add new nodes to DAG combiner worklist.
  DCI.CombineTo(N, BR, false);

  return SDValue();
}
22274
  // Fold CSEL 0, cttz(X), eq(X, 0)  (and the NE-swapped form) into
  // cttz(X) & (bitwidth - 1); see the caller's comment in performCSELCombine.
  unsigned CC = N->getConstantOperandVal(2);
  SDValue SUBS = N->getOperand(3);
  SDValue Zero, CTTZ;

  // Identify which operand is the expected zero and which the cttz, based on
  // the condition code.
  if (CC == AArch64CC::EQ && SUBS.getOpcode() == AArch64ISD::SUBS) {
    Zero = N->getOperand(0);
    CTTZ = N->getOperand(1);
  } else if (CC == AArch64CC::NE && SUBS.getOpcode() == AArch64ISD::SUBS) {
    Zero = N->getOperand(1);
    CTTZ = N->getOperand(0);
  } else
    return SDValue();

  // Accept cttz directly, or a truncate of a cttz.
  if ((CTTZ.getOpcode() != ISD::CTTZ && CTTZ.getOpcode() != ISD::TRUNCATE) ||
      (CTTZ.getOpcode() == ISD::TRUNCATE &&
       CTTZ.getOperand(0).getOpcode() != ISD::CTTZ))
    return SDValue();

  assert((CTTZ.getValueType() == MVT::i32 || CTTZ.getValueType() == MVT::i64) &&
         "Illegal type in CTTZ folding");

  if (!isNullConstant(Zero) || !isNullConstant(SUBS.getOperand(1)))
    return SDValue();

  // X is the value whose trailing zeros are counted; it must also be the value
  // the SUBS compares against zero.
  SDValue X = CTTZ.getOpcode() == ISD::TRUNCATE
                  ? CTTZ.getOperand(0).getOperand(0)
                  : CTTZ.getOperand(0);

  if (X != SUBS.getOperand(0))
    return SDValue();

  unsigned BitWidth = CTTZ.getOpcode() == ISD::TRUNCATE
                          ? CTTZ.getOperand(0).getValueSizeInBits()
                          : CTTZ.getValueSizeInBits();
  SDValue BitWidthMinusOne =
      DAG.getConstant(BitWidth - 1, SDLoc(N), CTTZ.getValueType());
  return DAG.getNode(ISD::AND, SDLoc(N), CTTZ.getValueType(), CTTZ,
                     BitWidthMinusOne);
}
22315
// (CSEL l r EQ (CMP (CSEL x y cc2 cond) x)) => (CSEL l r cc2 cond)
// (CSEL l r EQ (CMP (CSEL x y cc2 cond) y)) => (CSEL l r !cc2 cond)
// Where x and y are constants and x != y

// (CSEL l r NE (CMP (CSEL x y cc2 cond) x)) => (CSEL l r !cc2 cond)
// (CSEL l r NE (CMP (CSEL x y cc2 cond) y)) => (CSEL l r cc2 cond)
// Where x and y are constants and x != y
  SDValue L = Op->getOperand(0);
  SDValue R = Op->getOperand(1);
  AArch64CC::CondCode OpCC =
      static_cast<AArch64CC::CondCode>(Op->getConstantOperandVal(2));

  // The flag input must be a compare of an inner CSEL against one of its own
  // constant arms.
  SDValue OpCmp = Op->getOperand(3);
  if (!isCMP(OpCmp))
    return SDValue();

  SDValue CmpLHS = OpCmp.getOperand(0);
  SDValue CmpRHS = OpCmp.getOperand(1);

  // Canonicalize the inner CSEL onto CmpLHS.
  if (CmpRHS.getOpcode() == AArch64ISD::CSEL)
    std::swap(CmpLHS, CmpRHS);
  else if (CmpLHS.getOpcode() != AArch64ISD::CSEL)
    return SDValue();

  SDValue X = CmpLHS->getOperand(0);
  SDValue Y = CmpLHS->getOperand(1);
  if (!isa<ConstantSDNode>(X) || !isa<ConstantSDNode>(Y) || X == Y) {
    return SDValue();
  }

  // If one of the constant is opaque constant, x,y sdnode is still different
  // but the real value maybe the same. So check APInt here to make sure the
  // code is correct.
  ConstantSDNode *CX = cast<ConstantSDNode>(X);
  ConstantSDNode *CY = cast<ConstantSDNode>(Y);
  if (CX->getAPIntValue() == CY->getAPIntValue())
    return SDValue();

      static_cast<AArch64CC::CondCode>(CmpLHS->getConstantOperandVal(2));
  SDValue Cond = CmpLHS->getOperand(3);

  // Comparing against Y (the false arm) inverts the inner condition; any
  // other RHS than X means the pattern does not match.
  if (CmpRHS == Y)
  else if (CmpRHS != X)
    return SDValue();

  // An NE outer condition inverts once more; only EQ/NE are handled.
  if (OpCC == AArch64CC::NE)
  else if (OpCC != AArch64CC::EQ)
    return SDValue();

  SDLoc DL(Op);
  EVT VT = Op->getValueType(0);

  SDValue CCValue = DAG.getConstant(CC, DL, MVT::i32);
  return DAG.getNode(AArch64ISD::CSEL, DL, VT, L, R, CCValue, Cond);
}
22375
// Optimize CSEL instructions
                                  SelectionDAG &DAG) {
  // CSEL x, x, cc -> x
  if (N->getOperand(0) == N->getOperand(1))
    return N->getOperand(0);

  // CSEL of an inner CSEL compared against one of its constant arms.
  if (SDValue R = foldCSELOfCSEL(N, DAG))
    return R;

  // CSEL 0, cttz(X), eq(X, 0) -> AND cttz bitwidth-1
  // CSEL cttz(X), 0, ne(X, 0) -> AND cttz bitwidth-1
  if (SDValue Folded = foldCSELofCTTZ(N, DAG))
    return Folded;

  // Finally try to strip a redundant AND from the compare feeding operand 3.
  return performCONDCombine(N, DCI, DAG, 2, 3);
}
22394
// Try to re-use an already extended operand of a vector SetCC feeding a
// extended select. Doing so avoids requiring another full extension of the
// SET_CC result when lowering the select.
  EVT Op0MVT = Op->getOperand(0).getValueType();
  if (!Op0MVT.isVector() || Op->use_empty())
    return SDValue();

  // Make sure that all uses of Op are VSELECTs with result matching types where
  // the result type has a larger element type than the SetCC operand.
  SDNode *FirstUse = *Op->use_begin();
  if (FirstUse->getOpcode() != ISD::VSELECT)
    return SDValue();
  EVT UseMVT = FirstUse->getValueType(0);
  if (UseMVT.getScalarSizeInBits() <= Op0MVT.getScalarSizeInBits())
    return SDValue();
  if (any_of(Op->uses(), [&UseMVT](const SDNode *N) {
        return N->getOpcode() != ISD::VSELECT || N->getValueType(0) != UseMVT;
      }))
    return SDValue();

  // Operand 1 must be a constant splat so the new extension of it is free.
  APInt V;
  if (!ISD::isConstantSplatVector(Op->getOperand(1).getNode(), V))
    return SDValue();

  SDLoc DL(Op);
  SDValue Op0ExtV;
  SDValue Op1ExtV;
  ISD::CondCode CC = cast<CondCodeSDNode>(Op->getOperand(2))->get();
  // Check if the first operand of the SET_CC is already extended. If it is,
  // split the SET_CC and re-use the extended version of the operand.
  SDNode *Op0SExt = DAG.getNodeIfExists(ISD::SIGN_EXTEND, DAG.getVTList(UseMVT),
                                        Op->getOperand(0));
  SDNode *Op0ZExt = DAG.getNodeIfExists(ISD::ZERO_EXTEND, DAG.getVTList(UseMVT),
                                        Op->getOperand(0));
  // The extension kind must be compatible with the comparison's signedness;
  // equality compares accept either.
  if (Op0SExt && (isSignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
    Op0ExtV = SDValue(Op0SExt, 0);
    Op1ExtV = DAG.getNode(ISD::SIGN_EXTEND, DL, UseMVT, Op->getOperand(1));
  } else if (Op0ZExt && (isUnsignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
    Op0ExtV = SDValue(Op0ZExt, 0);
    Op1ExtV = DAG.getNode(ISD::ZERO_EXTEND, DL, UseMVT, Op->getOperand(1));
  } else
    return SDValue();

  return DAG.getNode(ISD::SETCC, DL, UseMVT.changeVectorElementType(MVT::i1),
                     Op0ExtV, Op1ExtV, Op->getOperand(2));
}
22442
// Rewrite a bitwise vector reduction over an i1 vector via
// getVectorBitwiseReduce before legalization.
static SDValue
                               SelectionDAG &DAG) {
  SDValue Vec = N->getOperand(0);
  if (DCI.isBeforeLegalize() &&
      Vec.getValueType().getVectorElementType() == MVT::i1 &&
    SDLoc DL(N);
    return getVectorBitwiseReduce(N->getOpcode(), Vec, N->getValueType(0), DL,
                                  DAG);
  }

  return SDValue();
}
22458
                                   SelectionDAG &DAG) {
  assert(N->getOpcode() == ISD::SETCC && "Unexpected opcode!");
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
  SDLoc DL(N);
  EVT VT = N->getValueType(0);

  // Re-use already-extended operands of a vector setcc feeding selects.
  if (SDValue V = tryToWidenSetCCOperands(N, DAG))
    return V;

  // setcc (csel 0, 1, cond, X), 1, ne ==> csel 0, 1, !cond, X
  if (Cond == ISD::SETNE && isOneConstant(RHS) &&
      LHS->getOpcode() == AArch64ISD::CSEL &&
      isNullConstant(LHS->getOperand(0)) && isOneConstant(LHS->getOperand(1)) &&
      LHS->hasOneUse()) {
    // Invert CSEL's condition.
    auto OldCond =
        static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(2));
    auto NewCond = getInvertedCondCode(OldCond);

    // csel 0, 1, !cond, X
    SDValue CSEL =
        DAG.getNode(AArch64ISD::CSEL, DL, LHS.getValueType(), LHS.getOperand(0),
                    LHS.getOperand(1), DAG.getConstant(NewCond, DL, MVT::i32),
                    LHS.getOperand(3));
    return DAG.getZExtOrTrunc(CSEL, DL, VT);
  }

  // setcc (srl x, imm), 0, ne ==> setcc (and x, (-1 << imm)), 0, ne
  if (Cond == ISD::SETNE && isNullConstant(RHS) &&
      LHS->getOpcode() == ISD::SRL && isa<ConstantSDNode>(LHS->getOperand(1)) &&
      LHS->getConstantOperandVal(1) < VT.getScalarSizeInBits() &&
      LHS->hasOneUse()) {
    EVT TstVT = LHS->getValueType(0);
    if (TstVT.isScalarInteger() && TstVT.getFixedSizeInBits() <= 64) {
      // this pattern will get better opt in emitComparison
      uint64_t TstImm = -1ULL << LHS->getConstantOperandVal(1);
      SDValue TST = DAG.getNode(ISD::AND, DL, TstVT, LHS->getOperand(0),
                                DAG.getConstant(TstImm, DL, TstVT));
      return DAG.getNode(ISD::SETCC, DL, VT, TST, RHS, N->getOperand(2));
    }
  }

  // setcc (iN (bitcast (vNi1 X))), 0, (eq|ne)
  //   ==> setcc (iN (zext (i1 (vecreduce_or (vNi1 X))))), 0, (eq|ne)
  // setcc (iN (bitcast (vNi1 X))), -1, (eq|ne)
  //   ==> setcc (iN (sext (i1 (vecreduce_and (vNi1 X))))), -1, (eq|ne)
  if (DCI.isBeforeLegalize() && VT.isScalarInteger() &&
      (Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
      LHS->getOpcode() == ISD::BITCAST) {
    EVT ToVT = LHS->getValueType(0);
    EVT FromVT = LHS->getOperand(0).getValueType();
    if (FromVT.isFixedLengthVector() &&
        FromVT.getVectorElementType() == MVT::i1) {
      // IsNull selects the zero-compare (zext/or) form over the all-ones
      // (sext/and) form shown above.
      bool IsNull = isNullConstant(RHS);
                        DL, MVT::i1, LHS->getOperand(0));
      LHS = DAG.getNode(IsNull ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND, DL, ToVT,
                        LHS);
      return DAG.getSetCC(DL, VT, LHS, RHS, Cond);
    }
  }

  // Try to perform the memcmp when the result is tested for [in]equality with 0
  if (SDValue V = performOrXorChainCombine(N, DAG))
    return V;

  return SDValue();
}
22532
// Replace a flag-setting operator (eg ANDS) with the generic version
// (eg AND) if the flag is unused.
                                         unsigned GenericOpcode) {
  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  EVT VT = N->getValueType(0);

  // If the flag result isn't used, convert back to a generic opcode.
  if (!N->hasAnyUseOfValue(1)) {
    SDValue Res = DCI.DAG.getNode(GenericOpcode, DL, VT, N->ops());
    // Pair the generic result with a constant-0 stand-in for the (unused)
    // flag result so both result numbers stay populated.
    return DCI.DAG.getMergeValues({Res, DCI.DAG.getConstant(0, DL, MVT::i32)},
                                  DL);
  }

  // Combine identical generic nodes into this node, re-using the result.
  if (SDNode *Generic = DCI.DAG.getNodeIfExists(
          GenericOpcode, DCI.DAG.getVTList(VT), {LHS, RHS}))
    DCI.CombineTo(Generic, SDValue(N, 0));

  return SDValue();
}
22557
  // setcc_merge_zero pred
  //   (sign_extend (extract_subvector (setcc_merge_zero ... pred ...))), 0, ne
  // => extract_subvector (inner setcc_merge_zero)
  SDValue Pred = N->getOperand(0);
  SDValue LHS = N->getOperand(1);
  SDValue RHS = N->getOperand(2);
  ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();

  // Only the != splat(0) form with a sign-extended LHS matches the pattern.
  if (Cond != ISD::SETNE || !isZerosVector(RHS.getNode()) ||
      LHS->getOpcode() != ISD::SIGN_EXTEND)
    return SDValue();

  // The extract must be the low part and produce N's result type.
  SDValue Extract = LHS->getOperand(0);
  if (Extract->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
      Extract->getValueType(0) != N->getValueType(0) ||
      Extract->getConstantOperandVal(1) != 0)
    return SDValue();

  SDValue InnerSetCC = Extract->getOperand(0);
  if (InnerSetCC->getOpcode() != AArch64ISD::SETCC_MERGE_ZERO)
    return SDValue();

  // By this point we've effectively got
  // zero_inactive_lanes_and_trunc_i1(sext_i1(A)). If we can prove A's inactive
  // lanes are already zero then the trunc(sext()) sequence is redundant and we
  // can operate on A directly.
  SDValue InnerPred = InnerSetCC.getOperand(0);
  if (Pred.getOpcode() == AArch64ISD::PTRUE &&
      InnerPred.getOpcode() == AArch64ISD::PTRUE &&
      Pred.getConstantOperandVal(0) == InnerPred.getConstantOperandVal(0) &&
      Pred->getConstantOperandVal(0) >= AArch64SVEPredPattern::vl1 &&
      Pred->getConstantOperandVal(0) <= AArch64SVEPredPattern::vl256)
    return Extract;

  return SDValue();
}
22595
// Simplify AArch64ISD::SETCC_MERGE_ZERO nodes: strip redundant
// sign_extend/extract wrappers (see performSetCCPunpkCombine) and fold the
// "!= splat(0)" comparison of an extended i1 vector.
static SDValue
  assert(N->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
         "Unexpected opcode!");

  SelectionDAG &DAG = DCI.DAG;
  SDValue Pred = N->getOperand(0);
  SDValue LHS = N->getOperand(1);
  SDValue RHS = N->getOperand(2);
  ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();

  if (SDValue V = performSetCCPunpkCombine(N, DAG))
    return V;

  if (Cond == ISD::SETNE && isZerosVector(RHS.getNode()) &&
      LHS->getOpcode() == ISD::SIGN_EXTEND &&
      LHS->getOperand(0)->getValueType(0) == N->getValueType(0)) {
    // setcc_merge_zero(
    //    pred, extend(setcc_merge_zero(pred, ...)), != splat(0))
    // => setcc_merge_zero(pred, ...)
    if (LHS->getOperand(0)->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
        LHS->getOperand(0)->getOperand(0) == Pred)
      return LHS->getOperand(0);

    // setcc_merge_zero(
    //    all_active, extend(nxvNi1 ...), != splat(0))
    // -> nxvNi1 ...
    if (isAllActivePredicate(DAG, Pred))
      return LHS->getOperand(0);

    // setcc_merge_zero(
    //    pred, extend(nxvNi1 ...), != splat(0))
    // -> nxvNi1 and(pred, ...)
    if (DCI.isAfterLegalizeDAG())
      // Do this after legalization to allow more folds on setcc_merge_zero
      // to be recognized.
      return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0),
                         LHS->getOperand(0), Pred);
  }

  return SDValue();
}
22638
22639// Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test
22640// as well as whether the test should be inverted. This code is required to
22641// catch these cases (as opposed to standard dag combines) because
22642// AArch64ISD::TBZ is matched during legalization.
22643static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert,
22644 SelectionDAG &DAG) {
22645
22646 if (!Op->hasOneUse())
22647 return Op;
22648
22649 // We don't handle undef/constant-fold cases below, as they should have
22650 // already been taken care of (e.g. and of 0, test of undefined shifted bits,
22651 // etc.)
22652
22653 // (tbz (trunc x), b) -> (tbz x, b)
22654 // This case is just here to enable more of the below cases to be caught.
22655 if (Op->getOpcode() == ISD::TRUNCATE &&
22656 Bit < Op->getValueType(0).getSizeInBits()) {
22657 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
22658 }
22659
22660 // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
22661 if (Op->getOpcode() == ISD::ANY_EXTEND &&
22662 Bit < Op->getOperand(0).getValueSizeInBits()) {
22663 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
22664 }
22665
22666 if (Op->getNumOperands() != 2)
22667 return Op;
22668
22669 auto *C = dyn_cast<ConstantSDNode>(Op->getOperand(1));
22670 if (!C)
22671 return Op;
22672
22673 switch (Op->getOpcode()) {
22674 default:
22675 return Op;
22676
22677 // (tbz (and x, m), b) -> (tbz x, b)
22678 case ISD::AND:
22679 if ((C->getZExtValue() >> Bit) & 1)
22680 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
22681 return Op;
22682
22683 // (tbz (shl x, c), b) -> (tbz x, b-c)
22684 case ISD::SHL:
22685 if (C->getZExtValue() <= Bit &&
22686 (Bit - C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
22687 Bit = Bit - C->getZExtValue();
22688 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
22689 }
22690 return Op;
22691
22692 // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x
22693 case ISD::SRA:
22694 Bit = Bit + C->getZExtValue();
22695 if (Bit >= Op->getValueType(0).getSizeInBits())
22696 Bit = Op->getValueType(0).getSizeInBits() - 1;
22697 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
22698
22699 // (tbz (srl x, c), b) -> (tbz x, b+c)
22700 case ISD::SRL:
22701 if ((Bit + C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
22702 Bit = Bit + C->getZExtValue();
22703 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
22704 }
22705 return Op;
22706
22707 // (tbz (xor x, -1), b) -> (tbnz x, b)
22708 case ISD::XOR:
22709 if ((C->getZExtValue() >> Bit) & 1)
22710 Invert = !Invert;
22711 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
22712 }
22713}
22714
// Optimize test single bit zero/non-zero and branch.
                                SelectionDAG &DAG) {
  unsigned Bit = N->getConstantOperandVal(2);
  bool Invert = false;
  SDValue TestSrc = N->getOperand(1);
  // getTestBitOperand may look through trunc/ext/and/shift/xor, adjusting
  // Bit and Invert accordingly.
  SDValue NewTestSrc = getTestBitOperand(TestSrc, Bit, Invert, DAG);

  if (TestSrc == NewTestSrc)
    return SDValue();

  // An odd number of bit inversions swaps TBZ <-> TBNZ.
  unsigned NewOpc = N->getOpcode();
  if (Invert) {
    if (NewOpc == AArch64ISD::TBZ)
      NewOpc = AArch64ISD::TBNZ;
    else {
      assert(NewOpc == AArch64ISD::TBNZ);
      NewOpc = AArch64ISD::TBZ;
    }
  }

  SDLoc DL(N);
  return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewTestSrc,
                     DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3));
}
22741
// Swap vselect operands where it may allow a predicated operation to achieve
// the `sel`.
//
//     (vselect (setcc ( condcode) (_) (_)) (a)          (op (a) (b)))
//  => (vselect (setcc (!condcode) (_) (_)) (op (a) (b)) (a))
  auto SelectA = N->getOperand(1);
  auto SelectB = N->getOperand(2);
  auto NTy = N->getValueType(0);

  // Only scalable vectors, with a single-use SETCC condition.
  if (!NTy.isScalableVector())
    return SDValue();
  SDValue SetCC = N->getOperand(0);
  if (SetCC.getOpcode() != ISD::SETCC || !SetCC.hasOneUse())
    return SDValue();

  // The false arm must be one of these FP ops with the true arm as its first
  // operand, so swapping yields the (op a b)/a shape shown above.
  switch (SelectB.getOpcode()) {
  default:
    return SDValue();
  case ISD::FMUL:
  case ISD::FSUB:
  case ISD::FADD:
    break;
  }
  if (SelectA != SelectB.getOperand(0))
    return SDValue();

  ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
  ISD::CondCode InverseCC =
  auto InverseSetCC =
      DAG.getSetCC(SDLoc(SetCC), SetCC.getValueType(), SetCC.getOperand(0),
                   SetCC.getOperand(1), InverseCC);

  return DAG.getNode(ISD::VSELECT, SDLoc(N), NTy,
                     {InverseSetCC, SelectB, SelectA});
}
22779
// vselect (v1i1 setcc) ->
//  vselect (v1iXX setcc)  (XX is the size of the compared operand type)
// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
// such VSELECT.
  if (auto SwapResult = trySwapVSelectOperands(N, DAG))
    return SwapResult;

  SDValue N0 = N->getOperand(0);
  EVT CCVT = N0.getValueType();

  // A known-constant predicate selects one arm unconditionally.
  if (isAllActivePredicate(DAG, N0))
    return N->getOperand(1);

  if (isAllInactivePredicate(N0))
    return N->getOperand(2);

  // Check for sign pattern (VSELECT setgt, iN lhs, -1, 1, -1) and transform
  // into (OR (ASR lhs, N-1), 1), which requires less instructions for the
  // supported types.
  SDValue SetCC = N->getOperand(0);
  if (SetCC.getOpcode() == ISD::SETCC &&
      SetCC.getOperand(2) == DAG.getCondCode(ISD::SETGT)) {
    SDValue CmpLHS = SetCC.getOperand(0);
    EVT VT = CmpLHS.getValueType();
    SDNode *CmpRHS = SetCC.getOperand(1).getNode();
    SDNode *SplatLHS = N->getOperand(1).getNode();
    SDNode *SplatRHS = N->getOperand(2).getNode();
    APInt SplatLHSVal;
    if (CmpLHS.getValueType() == N->getOperand(1).getValueType() &&
        VT.isSimple() &&
        is_contained(ArrayRef({MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
                               MVT::v2i32, MVT::v4i32, MVT::v2i64}),
                     VT.getSimpleVT().SimpleTy) &&
        ISD::isConstantSplatVector(SplatLHS, SplatLHSVal) &&
        SplatLHSVal.isOne() && ISD::isConstantSplatVectorAllOnes(CmpRHS) &&
      unsigned NumElts = VT.getVectorNumElements();
          NumElts, DAG.getConstant(VT.getScalarSizeInBits() - 1, SDLoc(N),
                                   VT.getScalarType()));
      SDValue Val = DAG.getBuildVector(VT, SDLoc(N), Ops);

      // ASR by (elt size - 1) broadcasts the sign bit; OR with 1 then yields
      // the 1/-1 splat pattern being selected.
      auto Shift = DAG.getNode(ISD::SRA, SDLoc(N), VT, CmpLHS, Val);
      auto Or = DAG.getNode(ISD::OR, SDLoc(N), VT, Shift, N->getOperand(1));
      return Or;
    }
  }

  EVT CmpVT = N0.getOperand(0).getValueType();
  if (N0.getOpcode() != ISD::SETCC ||
      CCVT.getVectorElementType() != MVT::i1 ||
    return SDValue();

  EVT ResVT = N->getValueType(0);
  // Only combine when the result type is of the same size as the compared
  // operands.
  if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
    return SDValue();

  // Re-issue the setcc with an integer-element condition type and rebuild
  // the vselect on it.
  SDValue IfTrue = N->getOperand(1);
  SDValue IfFalse = N->getOperand(2);
  SetCC = DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
                       N0.getOperand(0), N0.getOperand(1),
                       cast<CondCodeSDNode>(N0.getOperand(2))->get());
  return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC,
                     IfTrue, IfFalse);
}
22851
/// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
/// the compare-mask instructions rather than going via NZCV, even if LHS and
/// RHS are really scalar. This replaces any scalar setcc in the above pattern
/// with a vector one followed by a DUP shuffle on the result.
  SelectionDAG &DAG = DCI.DAG;
  SDValue N0 = N->getOperand(0);
  EVT ResVT = N->getValueType(0);

  if (N0.getOpcode() != ISD::SETCC)
    return SDValue();

  // Scalable result types are not handled by the fixed-width shuffle below.
  if (ResVT.isScalableVT())
    return SDValue();

  // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered
  // scalar SetCCResultType. We also don't expect vectors, because we assume
  // that selects fed by vector SETCCs are canonicalized to VSELECT.
  assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) &&
         "Scalar-SETCC feeding SELECT has unexpected result type!");

  // If NumMaskElts == 0, the comparison is larger than select result. The
  // largest real NEON comparison is 64-bits per lane, which means the result is
  // at most 32-bits and an illegal vector. Just bail out for now.
  EVT SrcVT = N0.getOperand(0).getValueType();

  // Don't try to do this optimization when the setcc itself has i1 operands.
  // There are no legal vectors of i1, so this would be pointless. v1f16 is
  // ruled out to prevent the creation of setcc that need to be scalarized.
  if (SrcVT == MVT::i1 ||
      (SrcVT.isFloatingPoint() && SrcVT.getSizeInBits() <= 16))
    return SDValue();

  int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
  if (!ResVT.isVector() || NumMaskElts == 0)
    return SDValue();

  SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);

  // Also bail out if the vector CCVT isn't the same size as ResVT.
  // This can happen if the SETCC operand size doesn't divide the ResVT size
  // (e.g., f64 vs v3f32).
  if (CCVT.getSizeInBits() != ResVT.getSizeInBits())
    return SDValue();

  // Make sure we didn't create illegal types, if we're not supposed to.
  assert(DCI.isBeforeLegalize() ||
         DAG.getTargetLoweringInfo().isTypeLegal(SrcVT));

  // First perform a vector comparison, where lane 0 is the one we're interested
  // in.
  SDLoc DL(N0);
  SDValue LHS =
      DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
  SDValue RHS =
      DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1));
  SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2));

  // Now duplicate the comparison mask we want across all other lanes.
  SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
  SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask);
  Mask = DAG.getNode(ISD::BITCAST, DL,
                     ResVT.changeVectorElementTypeToInteger(), Mask);

  return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
}
22920
// Combine on DUP-family nodes. Two rewrites are tried before falling back to
// performPostLD1Combine: (1) reuse an already-existing wider twin of this
// node via EXTRACT_SUBVECTOR, and (2) DUP(extract_vector_elt v, i) ->
// DUPLANE(v, i) when the vector types match.
22923 EVT VT = N->getValueType(0);
22924 SDLoc DL(N);
22925 // If "v2i32 DUP(x)" and "v4i32 DUP(x)" both exist, use an extract from the
22926 // 128bit vector version.
22927 if (VT.is64BitVector() && DCI.isAfterLegalizeDAG()) {
// NOTE(review): LVT (the wider value type queried below) is defined on a line
// not shown in this excerpt — presumably the 128-bit counterpart of VT;
// confirm against the full source.
22929 SmallVector<SDValue> Ops(N->ops());
22930 if (SDNode *LN = DCI.DAG.getNodeIfExists(N->getOpcode(),
22931 DCI.DAG.getVTList(LVT), Ops)) {
// Lane 0 of the wider node holds the same data, so a zero-index
// EXTRACT_SUBVECTOR yields this node's value for free.
22932 return DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(LN, 0),
22933 DCI.DAG.getConstant(0, DL, MVT::i64));
22934 }
22935 }
22936
22937 if (N->getOpcode() == AArch64ISD::DUP) {
22938 if (DCI.isAfterLegalizeDAG()) {
22939 // If scalar dup's operand is extract_vector_elt, try to combine them into
22940 // duplane. For example,
22941 //
22942 // t21: i32 = extract_vector_elt t19, Constant:i64<0>
22943 // t18: v4i32 = AArch64ISD::DUP t21
22944 // ==>
22945 // t22: v4i32 = AArch64ISD::DUPLANE32 t19, Constant:i64<0>
22946 SDValue EXTRACT_VEC_ELT = N->getOperand(0);
22947 if (EXTRACT_VEC_ELT.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
// Only fires when the source vector already has the result type, so the
// DUPLANE opcode can be chosen purely from the element type.
22948 if (VT == EXTRACT_VEC_ELT.getOperand(0).getValueType()) {
22949 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
22950 return DCI.DAG.getNode(Opcode, DL, VT, EXTRACT_VEC_ELT.getOperand(0),
22951 EXTRACT_VEC_ELT.getOperand(1))
22952 }
22953 }
22954 }
22955
// Fall back to the post-LD1 combine for scalar DUPs.
22956 return performPostLD1Combine(N, DCI, false);
22957 }
22958
22959 return SDValue();
22960}
22961
22962/// Get rid of unnecessary NVCASTs (that don't change the type).
// An NVCAST whose destination type equals its operand's type is a no-op;
// forward the operand so the cast folds away. (The signature line is not
// shown in this excerpt.)
22964 if (N->getValueType(0) == N->getOperand(0).getValueType())
22965 return N->getOperand(0);
22966
22967 return SDValue();
22968}
22969
22970// If all users of the globaladdr are of the form (globaladdr + constant), find
22971// the smallest constant, fold it into the globaladdr's offset and rewrite the
22972// globaladdr as (globaladdr + constant) - constant.
22974 const AArch64Subtarget *Subtarget,
22975 const TargetMachine &TM) {
22976 auto *GN = cast<GlobalAddressSDNode>(N);
// Only safe for direct references; the classification constant compared
// against here is on a line not shown in this excerpt — confirm it is the
// no-flags/direct case.
22977 if (Subtarget->ClassifyGlobalReference(GN->getGlobal(), TM) !=
22979 return SDValue();
22980
// -1ull == UINT64_MAX: a sentinel so the first std::min below always picks
// the first user's constant.
22981 uint64_t MinOffset = -1ull;
22982 for (SDNode *N : GN->uses()) {
// Note: this inner 'N' intentionally shadows the outer node parameter.
22983 if (N->getOpcode() != ISD::ADD)
22984 return SDValue();
// The constant may appear as either ADD operand.
22985 auto *C = dyn_cast<ConstantSDNode>(N->getOperand(0));
22986 if (!C)
22987 C = dyn_cast<ConstantSDNode>(N->getOperand(1));
22988 if (!C)
22989 return SDValue();
22990 MinOffset = std::min(MinOffset, C->getZExtValue());
22991 }
22992 uint64_t Offset = MinOffset + GN->getOffset();
22993
22994 // Require that the new offset is larger than the existing one. Otherwise, we
22995 // can end up oscillating between two possible DAGs, for example,
22996 // (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1).
22997 if (Offset <= uint64_t(GN->getOffset()))
22998 return SDValue();
22999
23000 // Check whether folding this offset is legal. It must not go out of bounds of
23001 // the referenced object to avoid violating the code model, and must be
23002 // smaller than 2^20 because this is the largest offset expressible in all
23003 // object formats. (The IMAGE_REL_ARM64_PAGEBASE_REL21 relocation in COFF
23004 // stores an immediate signed 21 bit offset.)
23005 //
23006 // This check also prevents us from folding negative offsets, which will end
23007 // up being treated in the same way as large positive ones. They could also
23008 // cause code model violations, and aren't really common enough to matter.
23009 if (Offset >= (1 << 20))
23010 return SDValue();
23011
23012 const GlobalValue *GV = GN->getGlobal();
23013 Type *T = GV->getValueType();
// Bail out for unsized types; the object-size comparison that completes this
// condition is on a line not shown in this excerpt.
23014 if (!T->isSized() ||
23016 return SDValue();
23017
// Emit (globaladdr + Offset) - MinOffset so each user's ADD re-folds to its
// own original offset.
23018 SDLoc DL(GN);
23019 SDValue Result = DAG.getGlobalAddress(GV, DL, MVT::i64, Offset);
23020 return DAG.getNode(ISD::SUB, DL, MVT::i64, Result,
23021 DAG.getConstant(MinOffset, DL, MVT::i64));
23022}
23023
// Folds (op (bitreverse x)) -> (cttz x) when the CSSC extension is present.
// The outer opcode is established by the caller — presumably CTLZ; the
// leading signature line is not shown in this excerpt.
23025 const AArch64Subtarget *Subtarget) {
23026 SDValue BR = N->getOperand(0);
// Requires CSSC and a scalar-integer bitreverse operand.
23027 if (!Subtarget->hasCSSC() || BR.getOpcode() != ISD::BITREVERSE ||
23028 !BR.getValueType().isScalarInteger())
23029 return SDValue();
23030
23031 SDLoc DL(N);
23032 return DAG.getNode(ISD::CTTZ, DL, BR.getValueType(), BR.getOperand(0));
23033}
23034
23035// Turns the vector of indices into a vector of byte offsets by scaling Offset
23036// by (BitWidth / 8).
23038 SDLoc DL, unsigned BitWidth) {
23039 assert(Offset.getValueType().isScalableVector() &&
23040 "This method is only for scalable vectors of offsets");
23041
// Scale by the element size in bytes using a splatted left shift:
// offset << log2(BitWidth / 8).
23042 SDValue Shift = DAG.getConstant(Log2_32(BitWidth / 8), DL, MVT::i64);
23043 SDValue SplatShift = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Shift);
23044
23045 return DAG.getNode(ISD::SHL, DL, MVT::nxv2i64, Offset, SplatShift);
23046}
23047
23048/// Check if the value of \p OffsetInBytes can be used as an immediate for
23049/// the gather load/prefetch and scatter store instructions with vector base and
23050/// immediate offset addressing mode:
23051///
23052/// [<Zn>.[S|D]{, #<imm>}]
23053///
23054/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
inline static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes,
                                                  unsigned ScalarSizeInBytes) {
  // Valid iff the offset is a whole number of scalar elements and the
  // resulting element index fits the 5-bit immediate field, i.e.
  // OffsetInBytes == ScalarSizeInBytes * k with k in [0, 31].
  return (OffsetInBytes % ScalarSizeInBytes == 0) &&
         (OffsetInBytes / ScalarSizeInBytes <= 31);
}
23067
23068/// Check if the value of \p Offset represents a valid immediate for the SVE
23069/// gather load/prefetch and scatter store instructions with vector base and
23070/// immediate offset addressing mode:
23071///
23072/// [<Zn>.[S|D]{, #<imm>}]
23073///
23074/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
23076 unsigned ScalarSizeInBytes) {
// Delegate to the unsigned overload once the SDValue is proven constant.
23077 ConstantSDNode *OffsetConst = dyn_cast<ConstantSDNode>(Offset.getNode());
23078 return OffsetConst && isValidImmForSVEVecImmAddrMode(
23079 OffsetConst->getZExtValue(), ScalarSizeInBytes);
23080}
23081
// Lower an SVE scatter-store intrinsic node to the target scatter opcode,
// legalizing the base/offset addressing operands along the way. (The leading
// signature line is not shown in this excerpt.)
23083 unsigned Opcode,
23084 bool OnlyPackedOffsets = true) {
23085 const SDValue Src = N->getOperand(2);
23086 const EVT SrcVT = Src->getValueType(0);
23087 assert(SrcVT.isScalableVector() &&
23088 "Scatter stores are only possible for SVE vectors");
23089
23090 SDLoc DL(N);
23091 MVT SrcElVT = SrcVT.getVectorElementType().getSimpleVT();
23092
23093 // Make sure that source data will fit into an SVE register
// (the size check itself is on a line not shown in this excerpt)
23095 return SDValue();
23096
23097 // For FPs, ACLE only supports _packed_ single and double precision types.
23098 // SST1Q_[INDEX_]PRED is the ST1Q for sve2p1 and should allow all sizes.
23099 if (SrcElVT.isFloatingPoint())
23100 if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64) &&
23101 ((Opcode != AArch64ISD::SST1Q_PRED &&
23102 Opcode != AArch64ISD::SST1Q_INDEX_PRED) ||
23103 ((SrcVT != MVT::nxv8f16) && (SrcVT != MVT::nxv8bf16))))
23104 return SDValue();
23105
23106 // Depending on the addressing mode, this is either a pointer or a vector of
23107 // pointers (that fits into one register)
23108 SDValue Base = N->getOperand(4);
23109 // Depending on the addressing mode, this is either a single offset or a
23110 // vector of offsets (that fits into one register)
23111 SDValue Offset = N->getOperand(5);
23112
23113 // For "scalar + vector of indices", just scale the indices. This only
23114 // applies to non-temporal scatters because there's no instruction that takes
23115 // indices.
23116 if (Opcode == AArch64ISD::SSTNT1_INDEX_PRED) {
// (the scaled-offset computation feeding Offset is on lines not shown here)
23117 Offset =
23119 Opcode = AArch64ISD::SSTNT1_PRED;
23120 } else if (Opcode == AArch64ISD::SST1Q_INDEX_PRED) {
23121 Offset =
23123 Opcode = AArch64ISD::SST1Q_PRED;
23124 }
23125
23126 // In the case of non-temporal gather loads there's only one SVE instruction
23127 // per data-size: "scalar + vector", i.e.
23128 // * stnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
23129 // Since we do have intrinsics that allow the arguments to be in a different
23130 // order, we may need to swap them to match the spec.
23131 if ((Opcode == AArch64ISD::SSTNT1_PRED || Opcode == AArch64ISD::SST1Q_PRED) &&
23132 Offset.getValueType().isVector())
// (the swap of Base and Offset is on a line not shown in this excerpt)
23134
23135 // SST1_IMM requires that the offset is an immediate that is:
23136 // * a multiple of #SizeInBytes,
23137 // * in the range [0, 31 x #SizeInBytes],
23138 // where #SizeInBytes is the size in bytes of the stored items. For
23139 // immediates outside that range and non-immediate scalar offsets use SST1 or
23140 // SST1_UXTW instead.
23141 if (Opcode == AArch64ISD::SST1_IMM_PRED) {
23143 SrcVT.getScalarSizeInBits() / 8)) {
23144 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
23146 else
23147 Opcode = AArch64ISD::SST1_PRED;
23148
// (the swap of Base and Offset after the opcode fixup is not shown here)
23150 }
23151 }
23152
23153 auto &TLI = DAG.getTargetLoweringInfo();
23154 if (!TLI.isTypeLegal(Base.getValueType()))
23155 return SDValue();
23156
23157 // Some scatter store variants allow unpacked offsets, but only as nxv2i32
23158 // vectors. These are implicitly sign (sxtw) or zero (zxtw) extend to
23159 // nxv2i64. Legalize accordingly.
23160 if (!OnlyPackedOffsets &&
23161 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
23162 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
23163
23164 if (!TLI.isTypeLegal(Offset.getValueType()))
23165 return SDValue();
23166
23167 // Source value type that is representable in hardware
23168 EVT HwSrcVt = getSVEContainerType(SrcVT);
23169
23170 // Keep the original type of the input data to store - this is needed to be
23171 // able to select the correct instruction, e.g. ST1B, ST1H, ST1W and ST1D. For
23172 // FP values we want the integer equivalent, so just use HwSrcVt.
23173 SDValue InputVT = DAG.getValueType(SrcVT);
23174 if (SrcVT.isFloatingPoint())
23175 InputVT = DAG.getValueType(HwSrcVt);
23176
23177 SDVTList VTs = DAG.getVTList(MVT::Other);
23178 SDValue SrcNew;
23179
// FP data is stored through its integer container type: bitcast for FP,
// any-extend for narrower integers.
23180 if (Src.getValueType().isFloatingPoint())
23181 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Src);
23182 else
23183 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Src);
23184
23185 SDValue Ops[] = {N->getOperand(0), // Chain
23186 SrcNew,
23187 N->getOperand(3), // Pg
23188 Base,
23189 Offset,
23190 InputVT};
23191
23192 return DAG.getNode(Opcode, DL, VTs, Ops);
23193}
23194
// Lower an SVE gather-load intrinsic node to the target gather opcode,
// legalizing base/offset operands and fixing up the returned value type.
// (The leading signature line is not shown in this excerpt.)
23196 unsigned Opcode,
23197 bool OnlyPackedOffsets = true) {
23198 const EVT RetVT = N->getValueType(0);
23199 assert(RetVT.isScalableVector() &&
23200 "Gather loads are only possible for SVE vectors");
23201
23202 SDLoc DL(N);
23203
23204 // Make sure that the loaded data will fit into an SVE register
// (the size check itself is on a line not shown in this excerpt)
23206 return SDValue();
23207
23208 // Depending on the addressing mode, this is either a pointer or a vector of
23209 // pointers (that fits into one register)
23210 SDValue Base = N->getOperand(3);
23211 // Depending on the addressing mode, this is either a single offset or a
23212 // vector of offsets (that fits into one register)
23213 SDValue Offset = N->getOperand(4);
23214
23215 // For "scalar + vector of indices", scale the indices to obtain unscaled
23216 // offsets. This applies to non-temporal and quadword gathers, which do not
23217 // have an addressing mode with scaled offset.
// (the opcode checks and scaled-offset calls are partly on lines not shown)
23220 RetVT.getScalarSizeInBits());
23222 } else if (Opcode == AArch64ISD::GLD1Q_INDEX_MERGE_ZERO) {
23224 RetVT.getScalarSizeInBits());
23226 }
23227
23228 // In the case of non-temporal gather loads and quadword gather loads there's
23229 // only one addressing mode : "vector + scalar", e.g.
23230 // ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
23231 // Since we do have intrinsics that allow the arguments to be in a different
23232 // order, we may need to swap them to match the spec.
23233 if ((Opcode == AArch64ISD::GLDNT1_MERGE_ZERO ||
23234 Opcode == AArch64ISD::GLD1Q_MERGE_ZERO) &&
23235 Offset.getValueType().isVector())
// (the swap of Base and Offset is on a line not shown in this excerpt)
23237
23238 // GLD{FF}1_IMM requires that the offset is an immediate that is:
23239 // * a multiple of #SizeInBytes,
23240 // * in the range [0, 31 x #SizeInBytes],
23241 // where #SizeInBytes is the size in bytes of the loaded items. For
23242 // immediates outside that range and non-immediate scalar offsets use
23243 // GLD1_MERGE_ZERO or GLD1_UXTW_MERGE_ZERO instead.
23244 if (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO ||
23247 RetVT.getScalarSizeInBits() / 8)) {
// Select the non-immediate (scalar-offset) variant; UXTW forms when the base
// is an unpacked 32-bit vector of pointers.
23248 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
23249 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
23252 else
23253 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
23256
23258 }
23259 }
23260
23261 auto &TLI = DAG.getTargetLoweringInfo();
23262 if (!TLI.isTypeLegal(Base.getValueType()))
23263 return SDValue();
23264
23265 // Some gather load variants allow unpacked offsets, but only as nxv2i32
23266 // vectors. These are implicitly sign (sxtw) or zero (zxtw) extend to
23267 // nxv2i64. Legalize accordingly.
23268 if (!OnlyPackedOffsets &&
23269 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
23270 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
23271
23272 // Return value type that is representable in hardware
23273 EVT HwRetVt = getSVEContainerType(RetVT);
23274
23275 // Keep the original output value type around - this is needed to be able to
23276 // select the correct instruction, e.g. LD1B, LD1H, LD1W and LD1D. For FP
23277 // values we want the integer equivalent, so just use HwRetVT.
23278 SDValue OutVT = DAG.getValueType(RetVT);
23279 if (RetVT.isFloatingPoint())
23280 OutVT = DAG.getValueType(HwRetVt);
23281
23282 SDVTList VTs = DAG.getVTList(HwRetVt, MVT::Other);
23283 SDValue Ops[] = {N->getOperand(0), // Chain
23284 N->getOperand(2), // Pg
23285 Base, Offset, OutVT};
23286
23287 SDValue Load = DAG.getNode(Opcode, DL, VTs, Ops);
23288 SDValue LoadChain = SDValue(Load.getNode(), 1);
23289
// Narrow integer results come back widened in the container type; truncate
// back to the requested type.
23290 if (RetVT.isInteger() && (RetVT != HwRetVt))
23291 Load = DAG.getNode(ISD::TRUNCATE, DL, RetVT, Load.getValue(0));
23292
23293 // If the original return value was FP, bitcast accordingly. Doing it here
23294 // means that we can avoid adding TableGen patterns for FPs.
23295 if (RetVT.isFloatingPoint())
23296 Load = DAG.getNode(ISD::BITCAST, DL, RetVT, Load.getValue(0));
23297
23298 return DAG.getMergeValues({Load, LoadChain}, DL);
23299}
23300
// SIGN_EXTEND_INREG combine: either push the extend through an unsigned
// unpack (turning it into a signed unpack) or fold it into the SVE load node
// that feeds it, apparently selecting a sign-extending load variant NewOpc —
// confirm against the full source (many case labels below are not shown in
// this excerpt).
23301static SDValue
23303 SelectionDAG &DAG) {
23304 SDLoc DL(N);
23305 SDValue Src = N->getOperand(0);
23306 unsigned Opc = Src->getOpcode();
23307
23308 // Sign extend of an unsigned unpack -> signed unpack
23309 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
23310
23311 unsigned SOpc = Opc == AArch64ISD::UUNPKHI ? AArch64ISD::SUNPKHI
23313
23314 // Push the sign extend to the operand of the unpack
23315 // This is necessary where, for example, the operand of the unpack
23316 // is another unpack:
23317 // 4i32 sign_extend_inreg (4i32 uunpklo(8i16 uunpklo (16i8 opnd)), from 4i8)
23318 // ->
23319 // 4i32 sunpklo (8i16 sign_extend_inreg(8i16 uunpklo (16i8 opnd), from 8i8)
23320 // ->
23321 // 4i32 sunpklo(8i16 sunpklo(16i8 opnd))
23322 SDValue ExtOp = Src->getOperand(0);
23323 auto VT = cast<VTSDNode>(N->getOperand(1))->getVT();
23324 EVT EltTy = VT.getVectorElementType();
// EltTy is only read by the assert below; the (void) silences unused-variable
// warnings in release builds.
23325 (void)EltTy;
23326
23327 assert((EltTy == MVT::i8 || EltTy == MVT::i16 || EltTy == MVT::i32) &&
23328 "Sign extending from an invalid type");
23329
23330 EVT ExtVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext());
23331
// (the SIGN_EXTEND_INREG node construction begins on a line not shown here)
23333 ExtOp, DAG.getValueType(ExtVT));
23334
23335 return DAG.getNode(SOpc, DL, N->getValueType(0), Ext);
23336 }
23337
23338 if (DCI.isBeforeLegalizeOps())
23339 return SDValue();
23340
// (an early-out condition is on lines not shown in this excerpt)
23342 return SDValue();
23343
23344 // SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates
23345 // for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes.
23346 unsigned NewOpc;
23347 unsigned MemVTOpNum = 4;
// NOTE(review): every case label of this switch (and the NewOpc assignments)
// was dropped from this excerpt; only the MemVTOpNum overrides and breaks
// survive. Consult the full source before editing.
23348 switch (Opc) {
23351 MemVTOpNum = 3;
23352 break;
23355 MemVTOpNum = 3;
23356 break;
23359 MemVTOpNum = 3;
23360 break;
23363 break;
23366 break;
23369 break;
23372 break;
23375 break;
23378 break;
23381 break;
23384 break;
23387 break;
23390 break;
23393 break;
23396 break;
23399 break;
23402 break;
23405 break;
23406 default:
23407 return SDValue();
23408 }
23409
23410 EVT SignExtSrcVT = cast<VTSDNode>(N->getOperand(1))->getVT();
23411 EVT SrcMemVT = cast<VTSDNode>(Src->getOperand(MemVTOpNum))->getVT();
23412
// Only profitable when the in-reg extension width matches the memory type and
// the load has no other users.
23413 if ((SignExtSrcVT != SrcMemVT) || !Src.hasOneUse())
23414 return SDValue();
23415
23416 EVT DstVT = N->getValueType(0);
23417 SDVTList VTs = DAG.getVTList(DstVT, MVT::Other);
23418
// (the Ops SmallVector declaration is on a line not shown in this excerpt)
23420 for (unsigned I = 0; I < Src->getNumOperands(); ++I)
23421 Ops.push_back(Src->getOperand(I));
23422
23423 SDValue ExtLoad = DAG.getNode(NewOpc, SDLoc(N), VTs, Ops);
23424 DCI.CombineTo(N, ExtLoad);
23425 DCI.CombineTo(Src.getNode(), ExtLoad, ExtLoad.getValue(1));
23426
23427 // Return N so it doesn't get rechecked
23428 return SDValue(N, 0);
23429}
23430
23431/// Legalize the gather prefetch (scalar + vector addressing mode) when the
23432/// offset vector is an unpacked 32-bit scalable vector. The other cases (Offset
23433/// != nxv2i32) do not need legalization.
// (The signature line is not shown in this excerpt.)
23435 const unsigned OffsetPos = 4;
23436 SDValue Offset = N->getOperand(OffsetPos);
23437
23438 // Not an unpacked vector, bail out.
23439 if (Offset.getValueType().getSimpleVT().SimpleTy != MVT::nxv2i32)
23440 return SDValue();
23441
23442 // Extend the unpacked offset vector to 64-bit lanes.
23443 SDLoc DL(N);
23444 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset);
23445 SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
23446 // Replace the offset operand with the 64-bit one.
23447 Ops[OffsetPos] = Offset;
23448
// Rebuild the same node with the widened offset; prefetches only produce a
// chain (MVT::Other).
23449 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
23450}
23451
23452/// Combines a node carrying the intrinsic
23453/// `aarch64_sve_prf<T>_gather_scalar_offset` into a node that uses
23454/// `aarch64_sve_prfb_gather_uxtw_index` when the scalar offset passed to
23455/// `aarch64_sve_prf<T>_gather_scalar_offset` is not a valid immediate for the
23456/// sve gather prefetch instruction with vector plus immediate addressing mode.
// (The signature's first line is not shown in this excerpt.)
23458 unsigned ScalarSizeInBytes) {
23459 const unsigned ImmPos = 4, OffsetPos = 3;
23460 // No need to combine the node if the immediate is valid...
23461 if (isValidImmForSVEVecImmAddrMode(N->getOperand(ImmPos), ScalarSizeInBytes))
23462 return SDValue();
23463
23464 // ...otherwise swap the offset base with the offset...
23465 SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
23466 std::swap(Ops[ImmPos], Ops[OffsetPos]);
23467 // ...and remap the intrinsic `aarch64_sve_prf<T>_gather_scalar_offset` to
23468 // `aarch64_sve_prfb_gather_uxtw_index`.
// Operand 1 holds the intrinsic ID for INTRINSIC_VOID-style nodes.
23469 SDLoc DL(N);
23470 Ops[1] = DAG.getConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index, DL,
23471 MVT::i64);
23472
23473 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
23474}
23475
23476// Return true if the vector operation can guarantee only the first lane of its
23477// result contains data, with all bits in other lanes set to zero.
// (The signature line is not shown in this excerpt.)
23479 switch (Op.getOpcode()) {
23480 default:
23481 return false;
// The qualifying opcodes (case labels on lines not shown in this excerpt)
// all fall through to return true.
23497 return true;
23498 }
23499}
23500
// Remove an INSERT_VECTOR_ELT that merely re-zeroes lanes which are already
// known to be zero. (The signature line is not shown in this excerpt.)
23502 assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT && "Unexpected node!");
23503 SDValue InsertVec = N->getOperand(0);
23504 SDValue InsertElt = N->getOperand(1);
23505 SDValue InsertIdx = N->getOperand(2);
23506
23507 // We only care about inserts into the first element...
23508 if (!isNullConstant(InsertIdx))
23509 return SDValue();
23510 // ...of a zero'd vector...
// (the all-zeros check on InsertVec is on a line not shown in this excerpt)
23512 return SDValue();
23513 // ...where the inserted data was previously extracted...
23514 if (InsertElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
23515 return SDValue();
23516
23517 SDValue ExtractVec = InsertElt.getOperand(0);
23518 SDValue ExtractIdx = InsertElt.getOperand(1);
23519
23520 // ...from the first element of a vector.
23521 if (!isNullConstant(ExtractIdx))
23522 return SDValue();
23523
23524 // If we get here we are effectively trying to zero lanes 1-N of a vector.
23525
23526 // Ensure there's no type conversion going on.
23527 if (N->getValueType(0) != ExtractVec.getValueType())
23528 return SDValue();
23529
23530 if (!isLanes1toNKnownZero(ExtractVec))
23531 return SDValue();
23532
23533 // The explicit zeroing is redundant.
23534 return ExtractVec;
23535}
23536
23537static SDValue
// (The function name/parameters and the removeRedundantInsertVectorElt call
// wrapping the return below are on lines not shown in this excerpt; the body
// tries the redundant-insert removal first, then the post-LD1 combine.)
23540 return Res;
23541
23542 return performPostLD1Combine(N, DCI, true);
23543}
23544
// Perform VECTOR_SPLICE on non-integer types via the equivalent integer type:
// bitcast/extend both inputs, splice, then truncate/bitcast back. (The
// signature line and the IntTy/ExtIntTy definitions are on lines not shown in
// this excerpt.)
23546 EVT Ty = N->getValueType(0);
// Integer splices need no conversion; bail out.
23547 if (Ty.isInteger())
23548 return SDValue();
23549
23552 if (ExtIntTy.getVectorElementType().getScalarSizeInBits() <
23554 return SDValue();
23555
23556 SDLoc DL(N);
23557 SDValue LHS = DAG.getAnyExtOrTrunc(DAG.getBitcast(IntTy, N->getOperand(0)),
23558 DL, ExtIntTy);
23559 SDValue RHS = DAG.getAnyExtOrTrunc(DAG.getBitcast(IntTy, N->getOperand(1)),
23560 DL, ExtIntTy);
23561 SDValue Idx = N->getOperand(2);
23562 SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, ExtIntTy, LHS, RHS, Idx);
23563 SDValue Trunc = DAG.getAnyExtOrTrunc(Splice, DL, IntTy);
23564 return DAG.getBitcast(Ty, Trunc);
23565}
23566
// fold (fpext (load x)) into an extending load when SVE fixed-length lowering
// can legally split the result. (The leading signature lines are not shown in
// this excerpt.)
23569 const AArch64Subtarget *Subtarget) {
23570 SDValue N0 = N->getOperand(0);
23571 EVT VT = N->getValueType(0);
23572
23573 // If this is fp_round(fpextend), don't fold it, allow ourselves to be folded.
23574 if (N->hasOneUse() && N->use_begin()->getOpcode() == ISD::FP_ROUND)
23575 return SDValue();
23576
23577 auto hasValidElementTypeForFPExtLoad = [](EVT VT) {
23578 EVT EltVT = VT.getVectorElementType();
23579 return EltVT == MVT::f32 || EltVT == MVT::f64;
23580 };
23581
23582 // fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
23583 // We purposefully don't care about legality of the nodes here as we know
23584 // they can be split down into something legal.
23585 if (DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(N0.getNode()) &&
23586 N0.hasOneUse() && Subtarget->useSVEForFixedLengthVectors() &&
23587 VT.isFixedLengthVector() && hasValidElementTypeForFPExtLoad(VT) &&
23588 VT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits()) {
23589 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
23590 SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
23591 LN0->getChain(), LN0->getBasePtr(),
23592 N0.getValueType(), LN0->getMemOperand());
23593 DCI.CombineTo(N, ExtLoad);
// Other users of the original narrow load get fp_round(extload) so the load
// is emitted exactly once.
23594 DCI.CombineTo(
23595 N0.getNode(),
23596 DAG.getNode(ISD::FP_ROUND, SDLoc(N0), N0.getValueType(), ExtLoad,
23597 DAG.getIntPtrConstant(1, SDLoc(N0), /*isTarget=*/true)),
23598 ExtLoad.getValue(1));
23599 return SDValue(N, 0); // Return N so it doesn't get rechecked!
23600 }
23601
23602 return SDValue();
23603}
23604
// Expand a bitwise select on scalable vectors into (Mask & In1) | (~Mask & In2)
// when neither SVE2 nor SME is available. (The signature's first line is not
// shown in this excerpt.)
23606 const AArch64Subtarget *Subtarget) {
23607 EVT VT = N->getValueType(0);
23608
23609 // Don't expand for NEON, SVE2 or SME
23610 if (!VT.isScalableVector() || Subtarget->hasSVE2() || Subtarget->hasSME())
23611 return SDValue();
23612
23613 SDLoc DL(N);
23614
23615 SDValue Mask = N->getOperand(0);
23616 SDValue In1 = N->getOperand(1);
23617 SDValue In2 = N->getOperand(2);
23618
23619 SDValue InvMask = DAG.getNOT(DL, Mask, VT);
23620 SDValue Sel = DAG.getNode(ISD::AND, DL, VT, Mask, In1);
23621 SDValue SelInv = DAG.getNode(ISD::AND, DL, VT, InvMask, In2);
23622 return DAG.getNode(ISD::OR, DL, VT, Sel, SelInv);
23623}
23624
// Rewrite DUPLANE128(bitcast(insert_subvector(undef, X, 0)), 0) so the insert
// and duplication happen in X's own element type, leaving a single bitcast of
// the final result. (The signature line is not shown in this excerpt.)
23626 EVT VT = N->getValueType(0);
23627
23628 SDValue Insert = N->getOperand(0);
23629 if (Insert.getOpcode() != ISD::INSERT_SUBVECTOR)
23630 return SDValue();
23631
23632 if (!Insert.getOperand(0).isUndef())
23633 return SDValue();
23634
// Both the insert position and the duplicated lane must be 0.
23635 uint64_t IdxInsert = Insert.getConstantOperandVal(2);
23636 uint64_t IdxDupLane = N->getConstantOperandVal(1);
23637 if (IdxInsert != 0 || IdxDupLane != 0)
23638 return SDValue();
23639
23640 SDValue Bitcast = Insert.getOperand(1);
23641 if (Bitcast.getOpcode() != ISD::BITCAST)
23642 return SDValue();
23643
23644 SDValue Subvec = Bitcast.getOperand(0);
23645 EVT SubvecVT = Subvec.getValueType();
23646 if (!SubvecVT.is128BitVector())
23647 return SDValue();
23648 EVT NewSubvecVT =
// (the scalable-counterpart type computation is on a line not shown here)
23650
23651 SDLoc DL(N);
23652 SDValue NewInsert =
23653 DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewSubvecVT,
23654 DAG.getUNDEF(NewSubvecVT), Subvec, Insert->getOperand(2));
23655 SDValue NewDuplane128 = DAG.getNode(AArch64ISD::DUPLANE128, DL, NewSubvecVT,
23656 NewInsert, N->getOperand(1));
23657 return DAG.getNode(ISD::BITCAST, DL, VT, NewDuplane128);
23658}
23659
23660// Try to combine mull with uzp1.
// Detects the high-half MULL (and, when present, its low-half partner sharing
// the same extract source) and rewrites their truncated operands through a
// single UZP1 plus subvector extracts. (The signature's first lines are not
// shown in this excerpt.)
23663 SelectionDAG &DAG) {
23664 if (DCI.isBeforeLegalizeOps())
23665 return SDValue();
23666
23667 SDValue LHS = N->getOperand(0);
23668 SDValue RHS = N->getOperand(1);
23669
23670 SDValue ExtractHigh;
23671 SDValue ExtractLow;
23672 SDValue TruncHigh;
23673 SDValue TruncLow;
23674 SDLoc DL(N);
23675
23676 // Check the operands are trunc and extract_high.
// (the extract-high predicate on LHS opens this condition on a line not shown)
23678 RHS.getOpcode() == ISD::TRUNCATE) {
23679 TruncHigh = RHS;
23680 if (LHS.getOpcode() == ISD::BITCAST)
23681 ExtractHigh = LHS.getOperand(0);
23682 else
23683 ExtractHigh = LHS;
23685 LHS.getOpcode() == ISD::TRUNCATE) {
23686 TruncHigh = LHS;
// NOTE(review): this branch handles the mirrored case (extract-high on RHS),
// yet tests LHS for BITCAST while peeling RHS — looks like a copy-paste typo
// (should presumably test RHS); confirm against upstream LLVM, where this was
// later corrected.
23687 if (LHS.getOpcode() == ISD::BITCAST)
23688 ExtractHigh = RHS.getOperand(0);
23689 else
23690 ExtractHigh = RHS;
23691 } else
23692 return SDValue();
23693
23694 // If the truncate's operand is BUILD_VECTOR with DUP, do not combine the op
23695 // with uzp1.
23696 // You can see the regressions on test/CodeGen/AArch64/aarch64-smull.ll
23697 SDValue TruncHighOp = TruncHigh.getOperand(0);
23698 EVT TruncHighOpVT = TruncHighOp.getValueType();
23699 if (TruncHighOp.getOpcode() == AArch64ISD::DUP ||
23700 DAG.isSplatValue(TruncHighOp, false))
23701 return SDValue();
23702
23703 // Check there is other extract_high with same source vector.
23704 // For example,
23705 //
23706 // t18: v4i16 = extract_subvector t2, Constant:i64<0>
23707 // t12: v4i16 = truncate t11
23708 // t31: v4i32 = AArch64ISD::SMULL t18, t12
23709 // t23: v4i16 = extract_subvector t2, Constant:i64<4>
23710 // t16: v4i16 = truncate t15
23711 // t30: v4i32 = AArch64ISD::SMULL t23, t1
23712 //
23713 // This dagcombine assumes the two extract_high uses same source vector in
23714 // order to detect the pair of the mull. If they have different source vector,
23715 // this code will not work.
23716 bool HasFoundMULLow = true;
23717 SDValue ExtractHighSrcVec = ExtractHigh.getOperand(0);
23718 if (ExtractHighSrcVec->use_size() != 2)
23719 HasFoundMULLow = false;
23720
23721 // Find ExtractLow.
23722 for (SDNode *User : ExtractHighSrcVec.getNode()->uses()) {
23723 if (User == ExtractHigh.getNode())
23724 continue;
23725
// (the second condition of this check is on a line not shown in this excerpt)
23726 if (User->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
23728 HasFoundMULLow = false;
23729 break;
23730 }
23731
23732 ExtractLow.setNode(User);
23733 }
23734
23735 if (!ExtractLow || !ExtractLow->hasOneUse())
23736 HasFoundMULLow = false;
23737
23738 // Check ExtractLow's user.
23739 if (HasFoundMULLow) {
23740 SDNode *ExtractLowUser = *ExtractLow.getNode()->use_begin();
// The low-half partner must be the same MULL opcode, with the truncate on
// the opposite operand from the extract.
23741 if (ExtractLowUser->getOpcode() != N->getOpcode()) {
23742 HasFoundMULLow = false;
23743 } else {
23744 if (ExtractLowUser->getOperand(0) == ExtractLow) {
23745 if (ExtractLowUser->getOperand(1).getOpcode() == ISD::TRUNCATE)
23746 TruncLow = ExtractLowUser->getOperand(1);
23747 else
23748 HasFoundMULLow = false;
23749 } else {
23750 if (ExtractLowUser->getOperand(0).getOpcode() == ISD::TRUNCATE)
23751 TruncLow = ExtractLowUser->getOperand(0);
23752 else
23753 HasFoundMULLow = false;
23754 }
23755 }
23756 }
23757
23758 // If the truncate's operand is BUILD_VECTOR with DUP, do not combine the op
23759 // with uzp1.
23760 // You can see the regressions on test/CodeGen/AArch64/aarch64-smull.ll
23761 EVT TruncHighVT = TruncHigh.getValueType();
23762 EVT UZP1VT = TruncHighVT.getDoubleNumVectorElementsVT(*DAG.getContext());
23763 SDValue TruncLowOp =
23764 HasFoundMULLow ? TruncLow.getOperand(0) : DAG.getUNDEF(UZP1VT);
23765 EVT TruncLowOpVT = TruncLowOp.getValueType();
23766 if (HasFoundMULLow && (TruncLowOp.getOpcode() == AArch64ISD::DUP ||
23767 DAG.isSplatValue(TruncLowOp, false)))
23768 return SDValue();
23769
23770 // Create uzp1, extract_high and extract_low.
23771 if (TruncHighOpVT != UZP1VT)
23772 TruncHighOp = DAG.getNode(ISD::BITCAST, DL, UZP1VT, TruncHighOp);
23773 if (TruncLowOpVT != UZP1VT)
23774 TruncLowOp = DAG.getNode(ISD::BITCAST, DL, UZP1VT, TruncLowOp);
23775
23776 SDValue UZP1 =
23777 DAG.getNode(AArch64ISD::UZP1, DL, UZP1VT, TruncLowOp, TruncHighOp);
23778 SDValue HighIdxCst =
23779 DAG.getConstant(TruncHighVT.getVectorNumElements(), DL, MVT::i64);
23780 SDValue NewTruncHigh =
23781 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, TruncHighVT, UZP1, HighIdxCst);
23782 DAG.ReplaceAllUsesWith(TruncHigh, NewTruncHigh);
23783
23784 if (HasFoundMULLow) {
23785 EVT TruncLowVT = TruncLow.getValueType();
23786 SDValue NewTruncLow = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, TruncLowVT,
23787 UZP1, ExtractLow.getOperand(1));
23788 DAG.ReplaceAllUsesWith(TruncLow, NewTruncLow);
23789 }
23790
23791 return SDValue(N, 0);
23792}
23793
// MULL combine driver: tries an inner combine first (its call is on a line
// not shown in this excerpt), then the MULL + UZP1 pairing combine. (The
// signature's first lines are also not shown.)
23796 SelectionDAG &DAG) {
23797 if (SDValue Val =
23799 return Val;
23800
23801 if (SDValue Val = tryCombineMULLWithUZP1(N, DCI, DAG))
23802 return Val;
23803
23804 return SDValue();
23805}
23806
23807static SDValue
// (The function name and first parameters are on a line not shown in this
// excerpt.)
23809 SelectionDAG &DAG) {
23810 // Let's do below transform.
23811 //
23812 // t34: v4i32 = AArch64ISD::UADDLV t2
23813 // t35: i32 = extract_vector_elt t34, Constant:i64<0>
23814 // t7: i64 = zero_extend t35
23815 // t20: v1i64 = scalar_to_vector t7
23816 // ==>
23817 // t34: v4i32 = AArch64ISD::UADDLV t2
23818 // t39: v2i32 = extract_subvector t34, Constant:i64<0>
23819 // t40: v1i64 = AArch64ISD::NVCAST t39
23820 if (DCI.isBeforeLegalizeOps())
23821 return SDValue();
23822
23823 EVT VT = N->getValueType(0);
23824 if (VT != MVT::v1i64)
23825 return SDValue();
23826
23827 SDValue ZEXT = N->getOperand(0);
23828 if (ZEXT.getOpcode() != ISD::ZERO_EXTEND || ZEXT.getValueType() != MVT::i64)
23829 return SDValue();
23830
23831 SDValue EXTRACT_VEC_ELT = ZEXT.getOperand(0);
23832 if (EXTRACT_VEC_ELT.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
23833 EXTRACT_VEC_ELT.getValueType() != MVT::i32)
23834 return SDValue();
23835
// Only lane 0 extracts match the pattern.
23836 if (!isNullConstant(EXTRACT_VEC_ELT.getOperand(1)))
23837 return SDValue();
23838
// Restrict to the exact UADDLV shape shown in the diagram above.
23839 SDValue UADDLV = EXTRACT_VEC_ELT.getOperand(0);
23840 if (UADDLV.getOpcode() != AArch64ISD::UADDLV ||
23841 UADDLV.getValueType() != MVT::v4i32 ||
23842 UADDLV.getOperand(0).getValueType() != MVT::v8i8)
23843 return SDValue();
23844
23845 // Let's generate new sequence with AArch64ISD::NVCAST.
23846 SDLoc DL(N);
23847 SDValue EXTRACT_SUBVEC =
23848 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, UADDLV,
23849 DAG.getConstant(0, DL, MVT::i64));
23850 SDValue NVCAST =
23851 DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v1i64, EXTRACT_SUBVEC);
23852
23853 return NVCAST;
23854}
23855
                                                 DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  // Dispatch on the node's opcode. Each perform*Combine helper implements one
  // family of target-specific combines and returns SDValue() ("no change")
  // when its pattern does not match, in which case we fall through to the
  // default result at the bottom of this function.
  switch (N->getOpcode()) {
  default:
    LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
    break;
  case ISD::VECREDUCE_AND:
  case ISD::VECREDUCE_OR:
  case ISD::VECREDUCE_XOR:
    return performVecReduceBitwiseCombine(N, DCI, DAG);
  case ISD::ADD:
  case ISD::SUB:
    return performAddSubCombine(N, DCI);
  case ISD::BUILD_VECTOR:
    return performBuildVectorCombine(N, DCI, DAG);
  case ISD::TRUNCATE:
    return performTruncateCombine(N, DAG);
  case AArch64ISD::ANDS:
    return performFlagSettingCombine(N, DCI, ISD::AND);
  case AArch64ISD::ADC:
    if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
      return R;
    return foldADCToCINC(N, DAG);
  case AArch64ISD::SBC:
    return foldOverflowCheck(N, DAG, /* IsAdd */ false);
  case AArch64ISD::ADCS:
    if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
      return R;
  case AArch64ISD::SBCS:
    if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ false))
      return R;
  case ISD::XOR:
    return performXorCombine(N, DAG, DCI, Subtarget);
  case ISD::MUL:
    return performMulCombine(N, DAG, DCI, Subtarget);
  case ISD::SINT_TO_FP:
  case ISD::UINT_TO_FP:
    return performIntToFpCombine(N, DAG, Subtarget);
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:
    return performFpToIntCombine(N, DAG, DCI, Subtarget);
  case ISD::FDIV:
    return performFDivCombine(N, DAG, DCI, Subtarget);
  case ISD::OR:
    return performORCombine(N, DCI, Subtarget, *this);
  case ISD::AND:
    return performANDCombine(N, DCI);
  case ISD::FADD:
    return performFADDCombine(N, DCI);
    return performIntrinsicCombine(N, DCI, Subtarget);
  case ISD::ANY_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::SIGN_EXTEND:
    return performExtendCombine(N, DCI, DAG);
    return performSignExtendInRegCombine(N, DCI, DAG);
    return performConcatVectorsCombine(N, DCI, DAG);
    return performExtractSubvectorCombine(N, DCI, DAG);
    return performInsertSubvectorCombine(N, DCI, DAG);
  case ISD::SELECT:
    return performSelectCombine(N, DCI);
  case ISD::VSELECT:
    return performVSelectCombine(N, DCI.DAG);
  case ISD::SETCC:
    return performSETCCCombine(N, DCI, DAG);
  case ISD::LOAD:
    return performLOADCombine(N, DCI, DAG, Subtarget);
  case ISD::STORE:
    return performSTORECombine(N, DCI, DAG, Subtarget);
  case ISD::MSTORE:
    return performMSTORECombine(N, DCI, DAG, Subtarget);
  case ISD::MGATHER:
  case ISD::MSCATTER:
    return performMaskedGatherScatterCombine(N, DCI, DAG);
  case ISD::VECTOR_SPLICE:
    return performSVESpliceCombine(N, DAG);
  case ISD::FP_EXTEND:
    return performFPExtendCombine(N, DAG, DCI, Subtarget);
  case AArch64ISD::BRCOND:
    return performBRCONDCombine(N, DCI, DAG);
  case AArch64ISD::TBNZ:
  case AArch64ISD::TBZ:
    return performTBZCombine(N, DCI, DAG);
  case AArch64ISD::CSEL:
    return performCSELCombine(N, DCI, DAG);
  case AArch64ISD::DUP:
    return performDUPCombine(N, DCI);
    return performDupLane128Combine(N, DAG);
  case AArch64ISD::NVCAST:
    return performNVCASTCombine(N);
  case AArch64ISD::SPLICE:
    return performSpliceCombine(N, DAG);
    return performUnpackCombine(N, DAG, Subtarget);
  case AArch64ISD::UZP1:
    return performUzpCombine(N, DAG, Subtarget);
    return performSetccMergeZeroCombine(N, DCI);
    return performGLD1Combine(N, DAG);
  case AArch64ISD::VASHR:
  case AArch64ISD::VLSHR:
    return performVectorShiftCombine(N, *this, DCI);
    return performSunpkloCombine(N, DAG);
  case AArch64ISD::BSP:
    return performBSPExpandForSVE(N, DAG, Subtarget);
    return performInsertVectorEltCombine(N, DCI);
    return performExtractVectorEltCombine(N, DCI, Subtarget);
  case ISD::VECREDUCE_ADD:
    return performVecReduceAddCombine(N, DCI.DAG, Subtarget);
  case AArch64ISD::UADDV:
    return performUADDVCombine(N, DAG);
  case AArch64ISD::SMULL:
  case AArch64ISD::UMULL:
  case AArch64ISD::PMULL:
    return performMULLCombine(N, DCI, DAG);
    switch (N->getConstantOperandVal(1)) {
    case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
      return combineSVEPrefetchVecBaseImmOff(N, DAG, 1 /*=ScalarSizeInBytes*/);
    case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
      return combineSVEPrefetchVecBaseImmOff(N, DAG, 2 /*=ScalarSizeInBytes*/);
    case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
      return combineSVEPrefetchVecBaseImmOff(N, DAG, 4 /*=ScalarSizeInBytes*/);
    case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
      return combineSVEPrefetchVecBaseImmOff(N, DAG, 8 /*=ScalarSizeInBytes*/);
    case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
    case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
    case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
    case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
    case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
    case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
    case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
    case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
    case Intrinsic::aarch64_neon_ld2:
    case Intrinsic::aarch64_neon_ld3:
    case Intrinsic::aarch64_neon_ld4:
    case Intrinsic::aarch64_neon_ld1x2:
    case Intrinsic::aarch64_neon_ld1x3:
    case Intrinsic::aarch64_neon_ld1x4:
    case Intrinsic::aarch64_neon_ld2lane:
    case Intrinsic::aarch64_neon_ld3lane:
    case Intrinsic::aarch64_neon_ld4lane:
    case Intrinsic::aarch64_neon_ld2r:
    case Intrinsic::aarch64_neon_ld3r:
    case Intrinsic::aarch64_neon_ld4r:
    case Intrinsic::aarch64_neon_st2:
    case Intrinsic::aarch64_neon_st3:
    case Intrinsic::aarch64_neon_st4:
    case Intrinsic::aarch64_neon_st1x2:
    case Intrinsic::aarch64_neon_st1x3:
    case Intrinsic::aarch64_neon_st1x4:
    case Intrinsic::aarch64_neon_st2lane:
    case Intrinsic::aarch64_neon_st3lane:
    case Intrinsic::aarch64_neon_st4lane:
      return performNEONPostLDSTCombine(N, DCI, DAG);
    case Intrinsic::aarch64_sve_ldnt1:
      return performLDNT1Combine(N, DAG);
    case Intrinsic::aarch64_sve_ld1rq:
      return performLD1ReplicateCombine<AArch64ISD::LD1RQ_MERGE_ZERO>(N, DAG);
    case Intrinsic::aarch64_sve_ld1ro:
      return performLD1ReplicateCombine<AArch64ISD::LD1RO_MERGE_ZERO>(N, DAG);
    case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
    case Intrinsic::aarch64_sve_ldnt1_gather:
    case Intrinsic::aarch64_sve_ldnt1_gather_index:
      return performGatherLoadCombine(N, DAG,
    case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
    case Intrinsic::aarch64_sve_ld1:
    case Intrinsic::aarch64_sve_ldnf1:
    case Intrinsic::aarch64_sve_ldff1:
    case Intrinsic::aarch64_sve_st1:
      return performST1Combine(N, DAG);
    case Intrinsic::aarch64_sve_stnt1:
      return performSTNT1Combine(N, DAG);
    case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
    case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
    case Intrinsic::aarch64_sve_stnt1_scatter:
    case Intrinsic::aarch64_sve_stnt1_scatter_index:
    case Intrinsic::aarch64_sve_ld1_gather:
    case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
    case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
    case Intrinsic::aarch64_sve_ld1q_gather_index:
      return performGatherLoadCombine(N, DAG,
    case Intrinsic::aarch64_sve_ld1_gather_index:
      return performGatherLoadCombine(N, DAG,
    case Intrinsic::aarch64_sve_ld1_gather_sxtw:
                                    /*OnlyPackedOffsets=*/false);
    case Intrinsic::aarch64_sve_ld1_gather_uxtw:
                                    /*OnlyPackedOffsets=*/false);
    case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
      return performGatherLoadCombine(N, DAG,
                                      /*OnlyPackedOffsets=*/false);
    case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
      return performGatherLoadCombine(N, DAG,
                                      /*OnlyPackedOffsets=*/false);
    case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
    case Intrinsic::aarch64_sve_ldff1_gather:
    case Intrinsic::aarch64_sve_ldff1_gather_index:
      return performGatherLoadCombine(N, DAG,
    case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
      return performGatherLoadCombine(N, DAG,
                                      /*OnlyPackedOffsets=*/false);
    case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
      return performGatherLoadCombine(N, DAG,
                                      /*OnlyPackedOffsets=*/false);
    case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
      return performGatherLoadCombine(N, DAG,
                                      /*OnlyPackedOffsets=*/false);
    case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
      return performGatherLoadCombine(N, DAG,
                                      /*OnlyPackedOffsets=*/false);
    case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
      return performGatherLoadCombine(N, DAG,
    case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
    case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
    case Intrinsic::aarch64_sve_st1q_scatter_index:
    case Intrinsic::aarch64_sve_st1_scatter:
    case Intrinsic::aarch64_sve_st1_scatter_index:
    case Intrinsic::aarch64_sve_st1_scatter_sxtw:
                                      /*OnlyPackedOffsets=*/false);
    case Intrinsic::aarch64_sve_st1_scatter_uxtw:
                                      /*OnlyPackedOffsets=*/false);
    case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
      return performScatterStoreCombine(N, DAG,
                                        /*OnlyPackedOffsets=*/false);
    case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
      return performScatterStoreCombine(N, DAG,
                                        /*OnlyPackedOffsets=*/false);
    case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
    case Intrinsic::aarch64_rndr:
    case Intrinsic::aarch64_rndrrs: {
      unsigned IntrinsicID = N->getConstantOperandVal(1);
      auto Register =
          (IntrinsicID == Intrinsic::aarch64_rndr ? AArch64SysReg::RNDR
                                                  : AArch64SysReg::RNDRRS);
      SDLoc DL(N);
      SDValue A = DAG.getNode(
          AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, MVT::Glue, MVT::Other),
          N->getOperand(0), DAG.getConstant(Register, DL, MVT::i64));
      SDValue B = DAG.getNode(
          AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32),
          DAG.getConstant(0, DL, MVT::i32),
          DAG.getConstant(AArch64CC::NE, DL, MVT::i32), A.getValue(1));
      return DAG.getMergeValues(
          {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL);
    }
    case Intrinsic::aarch64_sme_ldr_zt:
                         DAG.getVTList(MVT::Other), N->getOperand(0),
                         N->getOperand(2), N->getOperand(3));
    case Intrinsic::aarch64_sme_str_zt:
      return DAG.getNode(AArch64ISD::SAVE_ZT, SDLoc(N),
                         DAG.getVTList(MVT::Other), N->getOperand(0),
                         N->getOperand(2), N->getOperand(3));
    default:
      break;
    }
    break;
  case ISD::GlobalAddress:
    return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine());
  case ISD::CTLZ:
    return performCTLZCombine(N, DAG, Subtarget);
    return performScalarToVectorCombine(N, DCI, DAG);
  }
  // No target combine matched; let generic DAG combining continue.
  return SDValue();
}
24194
24195// Check if the return value is used as only a return value, as otherwise
24196// we can't perform a tail-call. In particular, we need to check for
24197// target ISD nodes that are returns and any other "odd" constructs
24198// that the generic analysis code won't necessarily catch.
24199bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
24200 SDValue &Chain) const {
24201 if (N->getNumValues() != 1)
24202 return false;
24203 if (!N->hasNUsesOfValue(1, 0))
24204 return false;
24205
24206 SDValue TCChain = Chain;
24207 SDNode *Copy = *N->use_begin();
24208 if (Copy->getOpcode() == ISD::CopyToReg) {
24209 // If the copy has a glue operand, we conservatively assume it isn't safe to
24210 // perform a tail call.
24211 if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
24212 MVT::Glue)
24213 return false;
24214 TCChain = Copy->getOperand(0);
24215 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
24216 return false;
24217
24218 bool HasRet = false;
24219 for (SDNode *Node : Copy->uses()) {
24220 if (Node->getOpcode() != AArch64ISD::RET_GLUE)
24221 return false;
24222 HasRet = true;
24223 }
24224
24225 if (!HasRet)
24226 return false;
24227
24228 Chain = TCChain;
24229 return true;
24230}
24231
// Return whether an instruction can potentially be optimized to a tail
// call. This will cause the optimizers to attempt to move, or duplicate,
// return instructions to help enable tail call optimizations for this
// instruction.
bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
  // Rely solely on the IR-level `tail` marker; no extra target checks here.
  return CI->isTailCall();
}
24239
24240bool AArch64TargetLowering::isIndexingLegal(MachineInstr &MI, Register Base,
24241 Register Offset, bool IsPre,
24242 MachineRegisterInfo &MRI) const {
24243 auto CstOffset = getIConstantVRegVal(Offset, MRI);
24244 if (!CstOffset || CstOffset->isZero())
24245 return false;
24246
24247 // All of the indexed addressing mode instructions take a signed 9 bit
24248 // immediate offset. Our CstOffset is a G_PTR_ADD offset so it already
24249 // encodes the sign/indexing direction.
24250 return isInt<9>(CstOffset->getSExtValue());
24251}
24252
24253bool AArch64TargetLowering::getIndexedAddressParts(SDNode *N, SDNode *Op,
24254 SDValue &Base,
24255 SDValue &Offset,
24256 SelectionDAG &DAG) const {
24257 if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
24258 return false;
24259
24260 // Non-null if there is exactly one user of the loaded value (ignoring chain).
24261 SDNode *ValOnlyUser = nullptr;
24262 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); UI != UE;
24263 ++UI) {
24264 if (UI.getUse().getResNo() == 1)
24265 continue; // Ignore chain.
24266 if (ValOnlyUser == nullptr)
24267 ValOnlyUser = *UI;
24268 else {
24269 ValOnlyUser = nullptr; // Multiple non-chain uses, bail out.
24270 break;
24271 }
24272 }
24273
24274 auto IsUndefOrZero = [](SDValue V) {
24275 return V.isUndef() || isNullOrNullSplat(V, /*AllowUndefs*/ true);
24276 };
24277
24278 // If the only user of the value is a scalable vector splat, it is
24279 // preferable to do a replicating load (ld1r*).
24280 if (ValOnlyUser && ValOnlyUser->getValueType(0).isScalableVector() &&
24281 (ValOnlyUser->getOpcode() == ISD::SPLAT_VECTOR ||
24282 (ValOnlyUser->getOpcode() == AArch64ISD::DUP_MERGE_PASSTHRU &&
24283 IsUndefOrZero(ValOnlyUser->getOperand(2)))))
24284 return false;
24285
24286 Base = Op->getOperand(0);
24287 // All of the indexed addressing mode instructions take a signed
24288 // 9 bit immediate offset.
24289 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
24290 int64_t RHSC = RHS->getSExtValue();
24291 if (Op->getOpcode() == ISD::SUB)
24292 RHSC = -(uint64_t)RHSC;
24293 if (!isInt<9>(RHSC))
24294 return false;
24295 // Always emit pre-inc/post-inc addressing mode. Use negated constant offset
24296 // when dealing with subtraction.
24297 Offset = DAG.getConstant(RHSC, SDLoc(N), RHS->getValueType(0));
24298 return true;
24299 }
24300 return false;
24301}
24302
// TargetLowering hook: try to express load/store node N with pre-indexed
// addressing. On success fills in Base/Offset and sets AM to ISD::PRE_INC.
bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
                                                      SDValue &Offset,
                                                      SelectionDAG &DAG) const {
  EVT VT;
  SDValue Ptr;
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
    VT = LD->getMemoryVT();
    Ptr = LD->getBasePtr();
  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
    VT = ST->getMemoryVT();
    Ptr = ST->getBasePtr();
  } else
    return false;

  // Delegate the actual base/offset decomposition to the shared helper.
  if (!getIndexedAddressParts(N, Ptr.getNode(), Base, Offset, DAG))
    return false;
  AM = ISD::PRE_INC;
  return true;
}
24323
// TargetLowering hook: try to express load/store node N with post-indexed
// addressing, where Op is the pointer-update arithmetic. On success fills in
// Base/Offset and sets AM to ISD::POST_INC.
bool AArch64TargetLowering::getPostIndexedAddressParts(
    ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
  EVT VT;
  SDValue Ptr;
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
    VT = LD->getMemoryVT();
    Ptr = LD->getBasePtr();
  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
    VT = ST->getMemoryVT();
    Ptr = ST->getBasePtr();
  } else
    return false;

  if (!getIndexedAddressParts(N, Op, Base, Offset, DAG))
    return false;
  // Post-indexing updates the base, so it's not a valid transform
  // if that's not the same as the load's pointer.
  if (Ptr != Base)
    return false;
  AM = ISD::POST_INC;
  return true;
}
24347
                                     SelectionDAG &DAG) {
  SDLoc DL(N);
  SDValue Op = N->getOperand(0);
  EVT VT = N->getValueType(0);
  [[maybe_unused]] EVT SrcVT = Op.getValueType();
  assert(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
         "Must be bool vector.");

  // Special handling for Clang's __builtin_convertvector. For vectors with <8
  // elements, it adds a vector concatenation with undef(s). If we encounter
  // this here, we can skip the concat.
  if (Op.getOpcode() == ISD::CONCAT_VECTORS && !Op.getOperand(0).isUndef()) {
    bool AllUndef = true;
    for (unsigned I = 1; I < Op.getNumOperands(); ++I)
      AllUndef &= Op.getOperand(I).isUndef();

    if (AllUndef)
      Op = Op.getOperand(0);
  }

  // Lower the i1 vector to a scalar bitmask, then extend/truncate that
  // scalar to the bitcast's result type. No result is pushed when the
  // helper cannot produce a bitmask for this vector shape.
  SDValue VectorBits = vectorToScalarBitmask(Op.getNode(), DAG);
  if (VectorBits)
    Results.push_back(DAG.getZExtOrTrunc(VectorBits, DL, VT));
}
24374
                                         SelectionDAG &DAG, EVT ExtendVT,
                                         EVT CastVT) {
  SDLoc DL(N);
  SDValue Op = N->getOperand(0);
  EVT VT = N->getValueType(0);

  // Use SCALAR_TO_VECTOR for lane zero: place the scalar in lane 0 of a
  // legal wide vector (ExtendVT), reinterpret it as CastVT, then extract
  // the leading subvector of the (non-legal) result type VT.
  SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtendVT, Op);
  SDValue CastVal = DAG.getNode(ISD::BITCAST, DL, CastVT, Vec);
  SDValue IdxZero = DAG.getVectorIdxConstant(0, DL);
  Results.push_back(
      DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, CastVal, IdxZero));
}
24390
// Custom result-type legalisation for BITCAST nodes whose destination type
// is not legal (small vectors such as v2i16/v4i8/v2i8, unpacked scalable
// vectors, bool vectors, and scalar f16/bf16 -> i16).
void AArch64TargetLowering::ReplaceBITCASTResults(
  SDLoc DL(N);
  SDValue Op = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT SrcVT = Op.getValueType();

  if (VT == MVT::v2i16 && SrcVT == MVT::i32) {
    CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v4i16);
    return;
  }

  if (VT == MVT::v4i8 && SrcVT == MVT::i32) {
    CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v8i8);
    return;
  }

  if (VT == MVT::v2i8 && SrcVT == MVT::i16) {
    CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v4i16, MVT::v8i8);
    return;
  }

  if (VT.isScalableVector() && !isTypeLegal(VT) && isTypeLegal(SrcVT)) {
    assert(!VT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
           "Expected fp->int bitcast!");

    // Bitcasting between unpacked vector types of different element counts is
    // not a NOP because the live elements are laid out differently.
    //                01234567
    // e.g. nxv2i32 = XX??XX??
    //      nxv4f16 = X?X?X?X?
    if (VT.getVectorElementCount() != SrcVT.getVectorElementCount())
      return;

    SDValue CastResult = getSVESafeBitCast(getSVEContainerType(VT), Op, DAG);
    Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, CastResult));
    return;
  }

  if (SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
      !VT.isVector())
    return replaceBoolVectorBitcast(N, Results, DAG);

  if (VT != MVT::i16 || (SrcVT != MVT::f16 && SrcVT != MVT::bf16))
    return;

  // f16/bf16 -> i16: move the half into the h-subregister of an f32, bitcast
  // the f32 to i32, and truncate to i16.
  Op = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
                                 DAG.getUNDEF(MVT::i32), Op);
  Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op);
  Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
}
24442
                               SelectionDAG &DAG,
                               const AArch64Subtarget *Subtarget) {
  EVT VT = N->getValueType(0);
  if (!VT.is256BitVector() ||
       !N->getFlags().hasAllowReassociation()) ||
      (VT.getScalarType() == MVT::f16 && !Subtarget->hasFullFP16()))
    return;

  // Match add(X, shuffle(X, <1,0,3,2,...>)) with the shuffle in either
  // operand position; on success X holds the non-shuffle operand.
  SDValue X = N->getOperand(0);
  auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(1));
  if (!Shuf) {
    Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0));
    X = N->getOperand(1);
    if (!Shuf)
      return;
  }

  if (Shuf->getOperand(0) != X || !Shuf->getOperand(1)->isUndef())
    return;

  // Check the mask is 1,0,3,2,5,4,...
  ArrayRef<int> Mask = Shuf->getMask();
  for (int I = 0, E = Mask.size(); I < E; I++)
    if (Mask[I] != (I % 2 == 0 ? I + 1 : I - 1))
      return;

  // Split the 256-bit vector in half and pairwise-add the halves; each ADDP
  // lane then holds the sum of one adjacent pair.
  SDLoc DL(N);
  auto LoHi = DAG.SplitVector(X, DL);
  assert(LoHi.first.getValueType() == LoHi.second.getValueType());
  SDValue Addp = DAG.getNode(AArch64ISD::ADDP, N, LoHi.first.getValueType(),
                             LoHi.first, LoHi.second);

  // Shuffle the elements back into order.
  SmallVector<int> NMask;
  for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I < E; I++) {
    NMask.push_back(I);
    NMask.push_back(I);
  }
  Results.push_back(
      DAG.getVectorShuffle(VT, DL,
                           DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Addp,
                                       DAG.getUNDEF(LoHi.first.getValueType())),
                           DAG.getUNDEF(VT), NMask));
}
24489
                                       SelectionDAG &DAG, unsigned InterOp,
                                       unsigned AcrossOp) {
  EVT LoVT, HiVT;
  SDValue Lo, Hi;
  SDLoc dl(N);
  // Split the wide operand in two, combine the halves element-wise with
  // InterOp, then reduce the narrowed vector with the across-lanes AcrossOp.
  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
  std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
  SDValue InterVal = DAG.getNode(InterOp, dl, LoVT, Lo, Hi);
  SDValue SplitVal = DAG.getNode(AcrossOp, dl, LoVT, InterVal);
  Results.push_back(SplitVal);
}
24503
// Custom result-type legalisation for EXTRACT_SUBVECTOR on scalable integer
// vectors: a halving extract (low or high half) is turned into UUNPKLO/HI
// on the widened element type followed by a truncate.
void AArch64TargetLowering::ReplaceExtractSubVectorResults(
  SDValue In = N->getOperand(0);
  EVT InVT = In.getValueType();

  // Common code will handle these just fine.
  if (!InVT.isScalableVector() || !InVT.isInteger())
    return;

  SDLoc DL(N);
  EVT VT = N->getValueType(0);

  // The following checks bail if this is not a halving operation.


  if (InVT.getVectorElementCount() != (ResEC * 2))
    return;

  auto *CIndex = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!CIndex)
    return;

  // Only extracts at index 0 (low half) or at the half-way point (high
  // half) are handled.
  unsigned Index = CIndex->getZExtValue();
  if ((Index != 0) && (Index != ResEC.getKnownMinValue()))
    return;

  unsigned Opcode = (Index == 0) ? AArch64ISD::UUNPKLO : AArch64ISD::UUNPKHI;
  EVT ExtendedHalfVT = VT.widenIntegerVectorElementType(*DAG.getContext());

  SDValue Half = DAG.getNode(Opcode, DL, ExtendedHalfVT, N->getOperand(0));
  Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, Half));
}
24537
// Create an even/odd pair of X registers holding integer value V.
// The i128 value is split into two i64 halves (swapped on big-endian) and
// wrapped in a REG_SEQUENCE over the XSeqPairs register class, as required
// by the CASP family of instructions.
  SDLoc dl(V.getNode());
  auto [VLo, VHi] = DAG.SplitScalar(V, dl, MVT::i64, MVT::i64);
  if (DAG.getDataLayout().isBigEndian())
    std::swap (VLo, VHi);
  SDValue RegClass =
      DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, dl, MVT::i32);
  SDValue SubReg0 = DAG.getTargetConstant(AArch64::sube64, dl, MVT::i32);
  SDValue SubReg1 = DAG.getTargetConstant(AArch64::subo64, dl, MVT::i32);
  const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
  return SDValue(
      DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
}
24552
                                       SelectionDAG &DAG,
                                       const AArch64Subtarget *Subtarget) {
  assert(N->getValueType(0) == MVT::i128 &&
         "AtomicCmpSwap on types less than 128 should be legal");

  // Two lowering strategies: CASP when LSE (or outlined atomics) is
  // available, otherwise the CMP_SWAP_128* pseudo instructions.
  MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
  if (Subtarget->hasLSE() || Subtarget->outlineAtomics()) {
    // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type,
    // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG.
    SDValue Ops[] = {
        createGPRPairNode(DAG, N->getOperand(2)), // Compare value
        createGPRPairNode(DAG, N->getOperand(3)), // Store value
        N->getOperand(1), // Ptr
        N->getOperand(0), // Chain in
    };

    unsigned Opcode;
    switch (MemOp->getMergedOrdering()) {
      Opcode = AArch64::CASPX;
      break;
      Opcode = AArch64::CASPAX;
      break;
      Opcode = AArch64::CASPLX;
      break;
      Opcode = AArch64::CASPALX;
      break;
    default:
      llvm_unreachable("Unexpected ordering!");
    }

    MachineSDNode *CmpSwap = DAG.getMachineNode(
        Opcode, SDLoc(N), DAG.getVTList(MVT::Untyped, MVT::Other), Ops);
    DAG.setNodeMemRefs(CmpSwap, {MemOp});

    unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64;
    if (DAG.getDataLayout().isBigEndian())
      std::swap(SubReg1, SubReg2);
    SDValue Lo = DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64,
                                            SDValue(CmpSwap, 0));
    SDValue Hi = DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64,
                                            SDValue(CmpSwap, 0));
    Results.push_back(
        DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
    Results.push_back(SDValue(CmpSwap, 1)); // Chain out
    return;
  }

  unsigned Opcode;
  switch (MemOp->getMergedOrdering()) {
    Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
    break;
    Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
    break;
    Opcode = AArch64::CMP_SWAP_128_RELEASE;
    break;
    Opcode = AArch64::CMP_SWAP_128;
    break;
  default:
    llvm_unreachable("Unexpected ordering!");
  }

  // No LSE: expand via the CMP_SWAP_128 pseudos, which take/produce the two
  // i64 halves in independent GPR64 registers.
  SDLoc DL(N);
  auto Desired = DAG.SplitScalar(N->getOperand(2), DL, MVT::i64, MVT::i64);
  auto New = DAG.SplitScalar(N->getOperand(3), DL, MVT::i64, MVT::i64);
  SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second,
                   New.first, New.second, N->getOperand(0)};
  SDNode *CmpSwap = DAG.getMachineNode(
      Opcode, SDLoc(N), DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other),
      Ops);
  DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});

  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
                                SDValue(CmpSwap, 0), SDValue(CmpSwap, 1)));
  Results.push_back(SDValue(CmpSwap, 3));
}
24640
// Map a 128-bit atomicrmw ISD opcode plus its memory ordering to the
// corresponding LSE128 machine instruction (LDCLRP*/LDSETP*/SWPP*).
static unsigned getAtomicLoad128Opcode(unsigned ISDOpcode,
                                       AtomicOrdering Ordering) {
  // ATOMIC_LOAD_CLR only appears when lowering ATOMIC_LOAD_AND (see
  // LowerATOMIC_LOAD_AND). We can't take that approach with 128-bit, because
  // the type is not legal. Therefore we shouldn't expect to see a 128-bit
  // ATOMIC_LOAD_CLR at any point.
  assert(ISDOpcode != ISD::ATOMIC_LOAD_CLR &&
         "ATOMIC_LOAD_AND should be lowered to LDCLRP directly");
  assert(ISDOpcode != ISD::ATOMIC_LOAD_ADD && "There is no 128 bit LDADD");
  assert(ISDOpcode != ISD::ATOMIC_LOAD_SUB && "There is no 128 bit LDSUB");

  // AND is implemented with LDCLRP (bit-clear); the caller XORs the operand.
  if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
    // The operand will need to be XORed in a separate step.
    switch (Ordering) {
      return AArch64::LDCLRP;
      break;
      return AArch64::LDCLRPA;
      break;
      return AArch64::LDCLRPL;
      break;
      return AArch64::LDCLRPAL;
      break;
    default:
      llvm_unreachable("Unexpected ordering!");
    }
  }

  if (ISDOpcode == ISD::ATOMIC_LOAD_OR) {
    switch (Ordering) {
      return AArch64::LDSETP;
      break;
      return AArch64::LDSETPA;
      break;
      return AArch64::LDSETPL;
      break;
      return AArch64::LDSETPAL;
      break;
    default:
      llvm_unreachable("Unexpected ordering!");
    }
  }

  if (ISDOpcode == ISD::ATOMIC_SWAP) {
    switch (Ordering) {
      return AArch64::SWPP;
      break;
      return AArch64::SWPPA;
      break;
      return AArch64::SWPPL;
      break;
      return AArch64::SWPPAL;
      break;
    default:
      llvm_unreachable("Unexpected ordering!");
    }
  }

  llvm_unreachable("Unexpected ISDOpcode!");
}
24715
                                          SelectionDAG &DAG,
                                          const AArch64Subtarget *Subtarget) {
  // LSE128 has a 128-bit RMW ops, but i128 is not a legal type, so lower it
  // here. This follows the approach of the CMP_SWAP_XXX pseudo instructions
  // rather than the CASP instructions, because CASP has register classes for
  // the pairs of registers and therefore uses REG_SEQUENCE and EXTRACT_SUBREG
  // to present them as single operands. LSE128 instructions use the GPR64
  // register class (because the pair does not have to be sequential), like
  // CMP_SWAP_XXX, and therefore we use TRUNCATE and BUILD_PAIR.

  assert(N->getValueType(0) == MVT::i128 &&
         "AtomicLoadXXX on types less than 128 should be legal");

  if (!Subtarget->hasLSE128())
    return;

  MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
  const SDValue &Chain = N->getOperand(0);
  const SDValue &Ptr = N->getOperand(1);
  const SDValue &Val128 = N->getOperand(2);
  std::pair<SDValue, SDValue> Val2x64 =
      DAG.SplitScalar(Val128, SDLoc(Val128), MVT::i64, MVT::i64);

  const unsigned ISDOpcode = N->getOpcode();
  const unsigned MachineOpcode =
      getAtomicLoad128Opcode(ISDOpcode, MemOp->getMergedOrdering());

  // LDCLRP clears the bits that are set in its operand, so AND is expressed
  // by first inverting both 64-bit halves of the operand.
  if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
    SDLoc dl(Val128);
    Val2x64.first =
        DAG.getNode(ISD::XOR, dl, MVT::i64,
                    DAG.getConstant(-1ULL, dl, MVT::i64), Val2x64.first);
    Val2x64.second =
        DAG.getNode(ISD::XOR, dl, MVT::i64,
                    DAG.getConstant(-1ULL, dl, MVT::i64), Val2x64.second);
  }

  SDValue Ops[] = {Val2x64.first, Val2x64.second, Ptr, Chain};
  if (DAG.getDataLayout().isBigEndian())
    std::swap(Ops[0], Ops[1]);

  MachineSDNode *AtomicInst =
      DAG.getMachineNode(MachineOpcode, SDLoc(N),
                         DAG.getVTList(MVT::i64, MVT::i64, MVT::Other), Ops);

  DAG.setNodeMemRefs(AtomicInst, {MemOp});

  SDValue Lo = SDValue(AtomicInst, 0), Hi = SDValue(AtomicInst, 1);
  if (DAG.getDataLayout().isBigEndian())
    std::swap(Lo, Hi);

  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
  Results.push_back(SDValue(AtomicInst, 2)); // Chain out
}
24772
24773void AArch64TargetLowering::ReplaceNodeResults(
24775 switch (N->getOpcode()) {
24776 default:
24777 llvm_unreachable("Don't know how to custom expand this");
24778 case ISD::BITCAST:
24779 ReplaceBITCASTResults(N, Results, DAG);
24780 return;
24781 case ISD::VECREDUCE_ADD:
24786 Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
24787 return;
24788 case ISD::ADD:
24789 case ISD::FADD:
24790 ReplaceAddWithADDP(N, Results, DAG, Subtarget);
24791 return;
24792
24793 case ISD::CTPOP:
24794 case ISD::PARITY:
24795 if (SDValue Result = LowerCTPOP_PARITY(SDValue(N, 0), DAG))
24796 Results.push_back(Result);
24797 return;
24798 case AArch64ISD::SADDV:
24800 return;
24801 case AArch64ISD::UADDV:
24803 return;
24804 case AArch64ISD::SMINV:
24806 return;
24807 case AArch64ISD::UMINV:
24809 return;
24810 case AArch64ISD::SMAXV:
24812 return;
24813 case AArch64ISD::UMAXV:
24815 return;
24816 case ISD::MULHS:
24818 Results.push_back(
24819 LowerToPredicatedOp(SDValue(N, 0), DAG, AArch64ISD::MULHS_PRED));
24820 return;
24821 case ISD::MULHU:
24823 Results.push_back(
24824 LowerToPredicatedOp(SDValue(N, 0), DAG, AArch64ISD::MULHU_PRED));
24825 return;
24826 case ISD::FP_TO_UINT:
24827 case ISD::FP_TO_SINT:
24830 assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
24831 // Let normal code take care of it by not adding anything to Results.
24832 return;
24834 ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
24835 return;
24837 assert(N->getValueType(0) != MVT::i128 &&
24838 "128-bit ATOMIC_LOAD_AND should be lowered directly to LDCLRP");
24839 break;
24842 case ISD::ATOMIC_SWAP: {
24843 assert(cast<AtomicSDNode>(N)->getVal().getValueType() == MVT::i128 &&
24844 "Expected 128-bit atomicrmw.");
24845 // These need custom type legalisation so we go directly to instruction.
24846 ReplaceATOMIC_LOAD_128Results(N, Results, DAG, Subtarget);
24847 return;
24848 }
24849 case ISD::ATOMIC_LOAD:
24850 case ISD::LOAD: {
24851 MemSDNode *LoadNode = cast<MemSDNode>(N);
24852 EVT MemVT = LoadNode->getMemoryVT();
24853 // Handle lowering 256 bit non temporal loads into LDNP for little-endian
24854 // targets.
24855 if (LoadNode->isNonTemporal() && Subtarget->isLittleEndian() &&
24856 MemVT.getSizeInBits() == 256u &&
24857 (MemVT.getScalarSizeInBits() == 8u ||
24858 MemVT.getScalarSizeInBits() == 16u ||
24859 MemVT.getScalarSizeInBits() == 32u ||
24860 MemVT.getScalarSizeInBits() == 64u)) {
24861
24864 DAG.getVTList({MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
24865 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
24866 MVT::Other}),
24867 {LoadNode->getChain(), LoadNode->getBasePtr()},
24868 LoadNode->getMemoryVT(), LoadNode->getMemOperand());
24869
24870 SDValue Pair = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), MemVT,
24871 Result.getValue(0), Result.getValue(1));
24872 Results.append({Pair, Result.getValue(2) /* Chain */});
24873 return;
24874 }
24875
24876 if ((!LoadNode->isVolatile() && !LoadNode->isAtomic()) ||
24877 LoadNode->getMemoryVT() != MVT::i128) {
24878 // Non-volatile or atomic loads are optimized later in AArch64's load/store
24879 // optimizer.
24880 return;
24881 }
24882
24883 if (SDValue(N, 0).getValueType() == MVT::i128) {
24884 auto *AN = dyn_cast<AtomicSDNode>(LoadNode);
24885 bool isLoadAcquire =
24887 unsigned Opcode = isLoadAcquire ? AArch64ISD::LDIAPP : AArch64ISD::LDP;
24888
24889 if (isLoadAcquire)
24890 assert(Subtarget->hasFeature(AArch64::FeatureRCPC3));
24891
24893 Opcode, SDLoc(N), DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
24894 {LoadNode->getChain(), LoadNode->getBasePtr()},
24895 LoadNode->getMemoryVT(), LoadNode->getMemOperand());
24896
24897 unsigned FirstRes = DAG.getDataLayout().isBigEndian() ? 1 : 0;
24898
24899 SDValue Pair =
24900 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
24901 Result.getValue(FirstRes), Result.getValue(1 - FirstRes));
24902 Results.append({Pair, Result.getValue(2) /* Chain */});
24903 }
24904 return;
24905 }
24907 ReplaceExtractSubVectorResults(N, Results, DAG);
24908 return;
24911 // Custom lowering has been requested for INSERT_SUBVECTOR and
24912 // CONCAT_VECTORS -- but delegate to common code for result type
24913 // legalisation
24914 return;
24916 EVT VT = N->getValueType(0);
24917 assert((VT == MVT::i8 || VT == MVT::i16) &&
24918 "custom lowering for unexpected type");
24919
24920 Intrinsic::ID IntID =
24921 static_cast<Intrinsic::ID>(N->getConstantOperandVal(0));
24922 switch (IntID) {
24923 default:
24924 return;
24925 case Intrinsic::aarch64_sve_clasta_n: {
24926 SDLoc DL(N);
24927 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
24928 auto V = DAG.getNode(AArch64ISD::CLASTA_N, DL, MVT::i32,
24929 N->getOperand(1), Op2, N->getOperand(3));
24930 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
24931 return;
24932 }
24933 case Intrinsic::aarch64_sve_clastb_n: {
24934 SDLoc DL(N);
24935 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
24936 auto V = DAG.getNode(AArch64ISD::CLASTB_N, DL, MVT::i32,
24937 N->getOperand(1), Op2, N->getOperand(3));
24938 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
24939 return;
24940 }
24941 case Intrinsic::aarch64_sve_lasta: {
24942 SDLoc DL(N);
24943 auto V = DAG.getNode(AArch64ISD::LASTA, DL, MVT::i32,
24944 N->getOperand(1), N->getOperand(2));
24945 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
24946 return;
24947 }
24948 case Intrinsic::aarch64_sve_lastb: {
24949 SDLoc DL(N);
24950 auto V = DAG.getNode(AArch64ISD::LASTB, DL, MVT::i32,
24951 N->getOperand(1), N->getOperand(2));
24952 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
24953 return;
24954 }
24955 }
24956 }
24957 case ISD::READ_REGISTER: {
24958 SDLoc DL(N);
24959 assert(N->getValueType(0) == MVT::i128 &&
24960 "READ_REGISTER custom lowering is only for 128-bit sysregs");
24961 SDValue Chain = N->getOperand(0);
24962 SDValue SysRegName = N->getOperand(1);
24963
24964 SDValue Result = DAG.getNode(
24965 AArch64ISD::MRRS, DL, DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
24966 Chain, SysRegName);
24967
24968 // Sysregs are not endian. Result.getValue(0) always contains the lower half
24969 // of the 128-bit System Register value.
24970 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
24971 Result.getValue(0), Result.getValue(1));
24972 Results.push_back(Pair);
24973 Results.push_back(Result.getValue(2)); // Chain
24974 return;
24975 }
24976 }
24977}
24978
24980 if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())
24982 return true;
24983}
24984
24985unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
24986 // Combine multiple FDIVs with the same divisor into multiple FMULs by the
24987 // reciprocal if there are three or more FDIVs.
24988 return 3;
24989}
24990
24993 // During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8,
24994 // v4i16, v2i32 instead of to promote.
24995 if (VT == MVT::v1i8 || VT == MVT::v1i16 || VT == MVT::v1i32 ||
24996 VT == MVT::v1f32)
24997 return TypeWidenVector;
24998
25000}
25001
// In v8.4a, ldp and stp instructions are guaranteed to be single-copy atomic
// provided the address is 16-byte aligned.
  if (!Subtarget->hasLSE2())
    return false;

  // A 16-byte-aligned 128-bit load can be implemented with a single LDP.
  if (auto LI = dyn_cast<LoadInst>(I))
    return LI->getType()->getPrimitiveSizeInBits() == 128 &&
           LI->getAlign() >= Align(16);

  // Likewise, a 16-byte-aligned 128-bit store maps to a single STP.
  if (auto SI = dyn_cast<StoreInst>(I))
    return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
           SI->getAlign() >= Align(16);

  // Other instruction kinds are not candidates for LDP/STP.
  return false;
25018
  if (!Subtarget->hasLSE128())
    return false;

  // Only use SWPP for stores where LSE2 would require a fence. Unlike STP, SWPP
  // will clobber the two registers.
  if (const auto *SI = dyn_cast<StoreInst>(I))
    return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
           SI->getAlign() >= Align(16) &&
           (SI->getOrdering() == AtomicOrdering::Release ||
            SI->getOrdering() == AtomicOrdering::SequentiallyConsistent);

  // For 128-bit atomicrmw, only the operations with a direct LSE128
  // instruction qualify (e.g. And is lowered to LDCLRP; see
  // ReplaceNodeResults).
  if (const auto *RMW = dyn_cast<AtomicRMWInst>(I))
    return RMW->getValOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
           RMW->getAlign() >= Align(16) &&
           (RMW->getOperation() == AtomicRMWInst::Xchg ||
            RMW->getOperation() == AtomicRMWInst::And ||
            RMW->getOperation() == AtomicRMWInst::Or);

  return false;
25040
  if (!Subtarget->hasLSE2() || !Subtarget->hasRCPC3())
    return false;

  // A 16-byte-aligned i128 acquire load can use the RCPC3 LDIAPP instruction
  // (emitted via ReplaceNodeResults).
  if (auto LI = dyn_cast<LoadInst>(I))
    return LI->getType()->getPrimitiveSizeInBits() == 128 &&
           LI->getAlign() >= Align(16) &&
           LI->getOrdering() == AtomicOrdering::Acquire;

  // A 16-byte-aligned i128 release store is likewise supported by RCPC3.
  if (auto SI = dyn_cast<StoreInst>(I))
    return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
           SI->getAlign() >= Align(16) &&
           SI->getOrdering() == AtomicOrdering::Release;

  return false;
25057
25059 const Instruction *I) const {
25061 return false;
25063 return false;
25065 return true;
25066 return false;
25067}
25068
25070 const Instruction *I) const {
25071 // Store-Release instructions only provide seq_cst guarantees when paired with
25072 // Load-Acquire instructions. MSVC CRT does not use these instructions to
25073 // implement seq_cst loads and stores, so we need additional explicit fences
25074 // after memory writes.
25075 if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
25076 return false;
25077
25078 switch (I->getOpcode()) {
25079 default:
25080 return false;
25081 case Instruction::AtomicCmpXchg:
25082 return cast<AtomicCmpXchgInst>(I)->getSuccessOrdering() ==
25084 case Instruction::AtomicRMW:
25085 return cast<AtomicRMWInst>(I)->getOrdering() ==
25087 case Instruction::Store:
25088 return cast<StoreInst>(I)->getOrdering() ==
25090 }
25091}
25092
25093// Loads and stores less than 128-bits are already atomic; ones above that
25094// are doomed anyway, so defer to the default libcall and blame the OS when
25095// things go wrong.
25098 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
25099 if (Size != 128)
25101 if (isOpSuitableForRCPC3(SI))
25103 if (isOpSuitableForLSE128(SI))
25105 if (isOpSuitableForLDPSTP(SI))
25108}
25109
25110// Loads and stores less than 128-bits are already atomic; ones above that
25111// are doomed anyway, so defer to the default libcall and blame the OS when
25112// things go wrong.
25115 unsigned Size = LI->getType()->getPrimitiveSizeInBits();
25116
25117 if (Size != 128)
25119 if (isOpSuitableForRCPC3(LI))
25121 // No LSE128 loads
25122 if (isOpSuitableForLDPSTP(LI))
25124
25125 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
25126 // implement atomicrmw without spilling. If the target address is also on the
25127 // stack and close enough to the spill slot, this can lead to a situation
25128 // where the monitor always gets cleared and the atomic operation can never
25129 // succeed. So at -O0 lower this operation to a CAS loop.
25130 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
25132
25133 // Using CAS for an atomic load has a better chance of succeeding under high
25134 // contention situations. So use it if available.
25135 return Subtarget->hasLSE() ? AtomicExpansionKind::CmpXChg
25137}
25138
25139// The "default" for integer RMW operations is to expand to an LL/SC loop.
25140// However, with the LSE instructions (or outline-atomics mode, which provides
25141// library routines in place of the LSE-instructions), we can directly emit many
25142// operations instead.
25143//
25144// Floating-point operations are always emitted to a cmpxchg loop, because they
25145// may trigger a trap which aborts an LLSC sequence.
25148 unsigned Size = AI->getType()->getPrimitiveSizeInBits();
25149 assert(Size <= 128 && "AtomicExpandPass should've handled larger sizes.");
25150
25151 if (AI->isFloatingPointOperation())
25153
25154 bool CanUseLSE128 = Subtarget->hasLSE128() && Size == 128 &&
25158 if (CanUseLSE128)
25160
25161 // Nand is not supported in LSE.
25162 // Leave 128 bits to LLSC or CmpXChg.
25163 if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128) {
25164 if (Subtarget->hasLSE())
25166 if (Subtarget->outlineAtomics()) {
25167 // [U]Min/[U]Max RWM atomics are used in __sync_fetch_ libcalls so far.
25168 // Don't outline them unless
25169 // (1) high level <atomic> support approved:
25170 // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf
25171 // (2) low level libgcc and compiler-rt support implemented by:
25172 // min/max outline atomics helpers
25173 if (AI->getOperation() != AtomicRMWInst::Min &&
25178 }
25179 }
25180 }
25181
25182 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
25183 // implement atomicrmw without spilling. If the target address is also on the
25184 // stack and close enough to the spill slot, this can lead to a situation
25185 // where the monitor always gets cleared and the atomic operation can never
25186 // succeed. So at -O0 lower this operation to a CAS loop. Also worthwhile if
25187 // we have a single CAS instruction that can replace the loop.
25189 Subtarget->hasLSE())
25191
25193}
25194
25197 AtomicCmpXchgInst *AI) const {
25198 // If subtarget has LSE, leave cmpxchg intact for codegen.
25199 if (Subtarget->hasLSE() || Subtarget->outlineAtomics())
25201 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
25202 // implement cmpxchg without spilling. If the address being exchanged is also
25203 // on the stack and close enough to the spill slot, this can lead to a
25204 // situation where the monitor always gets cleared and the atomic operation
25205 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
25206 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
25208
25209 // 128-bit atomic cmpxchg is weird; AtomicExpand doesn't know how to expand
25210 // it.
25212 if (Size > 64)
25214
25216}
25217
25219 Type *ValueTy, Value *Addr,
25220 AtomicOrdering Ord) const {
25221 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
25222 bool IsAcquire = isAcquireOrStronger(Ord);
25223
25224 // Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd
25225 // intrinsic must return {i64, i64} and we have to recombine them into a
25226 // single i128 here.
25227 if (ValueTy->getPrimitiveSizeInBits() == 128) {
25229 IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
25231
25232 Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi");
25233
25234 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
25235 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
25236 Lo = Builder.CreateZExt(Lo, ValueTy, "lo64");
25237 Hi = Builder.CreateZExt(Hi, ValueTy, "hi64");
25238 return Builder.CreateOr(
25239 Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 64)), "val64");
25240 }
25241
25242 Type *Tys[] = { Addr->getType() };
25244 IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
25245 Function *Ldxr = Intrinsic::getDeclaration(M, Int, Tys);
25246
25247 const DataLayout &DL = M->getDataLayout();
25248 IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(ValueTy));
25249 CallInst *CI = Builder.CreateCall(Ldxr, Addr);
25250 CI->addParamAttr(
25251 0, Attribute::get(Builder.getContext(), Attribute::ElementType, ValueTy));
25252 Value *Trunc = Builder.CreateTrunc(CI, IntEltTy);
25253
25254 return Builder.CreateBitCast(Trunc, ValueTy);
25255}
25256
25258 IRBuilderBase &Builder) const {
25259 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
25260 Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex));
25261}
25262
25264 Value *Val, Value *Addr,
25265 AtomicOrdering Ord) const {
25266 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
25267 bool IsRelease = isReleaseOrStronger(Ord);
25268
25269 // Since the intrinsics must have legal type, the i128 intrinsics take two
25270 // parameters: "i64, i64". We must marshal Val into the appropriate form
25271 // before the call.
25272 if (Val->getType()->getPrimitiveSizeInBits() == 128) {
25274 IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
25276 Type *Int64Ty = Type::getInt64Ty(M->getContext());
25277
25278 Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo");
25279 Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi");
25280 return Builder.CreateCall(Stxr, {Lo, Hi, Addr});
25281 }
25282
25284 IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
25285 Type *Tys[] = { Addr->getType() };
25286 Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys);
25287
25288 const DataLayout &DL = M->getDataLayout();
25289 IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType()));
25290 Val = Builder.CreateBitCast(Val, IntValTy);
25291
25292 CallInst *CI = Builder.CreateCall(
25293 Stxr, {Builder.CreateZExtOrBitCast(
25294 Val, Stxr->getFunctionType()->getParamType(0)),
25295 Addr});
25296 CI->addParamAttr(1, Attribute::get(Builder.getContext(),
25297 Attribute::ElementType, Val->getType()));
25298 return CI;
25299}
25300
25302 Type *Ty, CallingConv::ID CallConv, bool isVarArg,
25303 const DataLayout &DL) const {
25304 if (!Ty->isArrayTy()) {
25305 const TypeSize &TySize = Ty->getPrimitiveSizeInBits();
25306 return TySize.isScalable() && TySize.getKnownMinValue() > 128;
25307 }
25308
25309 // All non aggregate members of the type must have the same type
25310 SmallVector<EVT> ValueVTs;
25311 ComputeValueVTs(*this, DL, Ty, ValueVTs);
25312 return all_equal(ValueVTs);
25313}
25314
25315bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
25316 EVT) const {
25317 return false;
25318}
25319
25320static Value *UseTlsOffset(IRBuilderBase &IRB, unsigned Offset) {
25321 Module *M = IRB.GetInsertBlock()->getParent()->getParent();
25322 Function *ThreadPointerFunc =
25323 Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
25324 return IRB.CreatePointerCast(
25325 IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc),
25326 Offset),
25327 IRB.getPtrTy(0));
25328}
25329
25331 // Android provides a fixed TLS slot for the stack cookie. See the definition
25332 // of TLS_SLOT_STACK_GUARD in
25333 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
25334 if (Subtarget->isTargetAndroid())
25335 return UseTlsOffset(IRB, 0x28);
25336
25337 // Fuchsia is similar.
25338 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
25339 if (Subtarget->isTargetFuchsia())
25340 return UseTlsOffset(IRB, -0x10);
25341
25343}
25344
25346 // MSVC CRT provides functionalities for stack protection.
25347 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) {
25348 // MSVC CRT has a global variable holding security cookie.
25349 M.getOrInsertGlobal("__security_cookie",
25350 PointerType::getUnqual(M.getContext()));
25351
25352 // MSVC CRT has a function to validate security cookie.
25353 FunctionCallee SecurityCheckCookie =
25354 M.getOrInsertFunction(Subtarget->getSecurityCheckCookieName(),
25355 Type::getVoidTy(M.getContext()),
25356 PointerType::getUnqual(M.getContext()));
25357 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
25358 F->setCallingConv(CallingConv::Win64);
25359 F->addParamAttr(0, Attribute::AttrKind::InReg);
25360 }
25361 return;
25362 }
25364}
25365
25367 // MSVC CRT has a global variable holding security cookie.
25368 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
25369 return M.getGlobalVariable("__security_cookie");
25371}
25372
25374 // MSVC CRT has a function to validate security cookie.
25375 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
25376 return M.getFunction(Subtarget->getSecurityCheckCookieName());
25378}
25379
25380Value *
25382 // Android provides a fixed TLS slot for the SafeStack pointer. See the
25383 // definition of TLS_SLOT_SAFESTACK in
25384 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
25385 if (Subtarget->isTargetAndroid())
25386 return UseTlsOffset(IRB, 0x48);
25387
25388 // Fuchsia is similar.
25389 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
25390 if (Subtarget->isTargetFuchsia())
25391 return UseTlsOffset(IRB, -0x8);
25392
25394}
25395
    const Instruction &AndI) const {
  // Only sink the 'and' mask to the cmp's use block if it is masking a single
  // bit, since that likely lets the and/cmp/br fold into a single tbz
  // instruction. It may be beneficial to sink in other cases, but we would
  // have to check that the cmp would not get folded into the br to form a cbz
  // for these to be beneficial.
  ConstantInt* Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
  if (!Mask)
    return false;
  return Mask->getValue().isPowerOf2();
}
25408
25412 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
25413 SelectionDAG &DAG) const {
25414 // Does baseline recommend not to perform the fold by default?
25416 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
25417 return false;
25418 // Else, if this is a vector shift, prefer 'shl'.
25419 return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL;
25420}
25421
25424 SelectionDAG &DAG, SDNode *N, unsigned int ExpansionFactor) const {
25426 !Subtarget->isTargetWindows() && !Subtarget->isTargetDarwin())
25429 ExpansionFactor);
25430}
25431
  // Record that this function uses split callee-saved-register handling by
  // updating IsSplitCSR in AArch64FunctionInfo.
  AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
  AFI->setIsSplitCSR(true);
}
25437
    MachineBasicBlock *Entry,
    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
  // Split-CSR: instead of spilling these callee-saved registers in the
  // prologue/epilogue, copy each into a fresh virtual register at function
  // entry and copy it back right before each exit block's terminator.
  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
  // Null means this function does not use CSRs-via-copy; nothing to do.
  if (!IStart)
    return;

  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
  MachineBasicBlock::iterator MBBI = Entry->begin();
  for (const MCPhysReg *I = IStart; *I; ++I) {
    // Select the register class matching the physical CSR (64-bit GPR or FPR).
    const TargetRegisterClass *RC = nullptr;
    if (AArch64::GPR64RegClass.contains(*I))
      RC = &AArch64::GPR64RegClass;
    else if (AArch64::FPR64RegClass.contains(*I))
      RC = &AArch64::FPR64RegClass;
    else
      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

    Register NewVR = MRI->createVirtualRegister(RC);
    // Create copy from CSR to a virtual register.
    // FIXME: this currently does not emit CFI pseudo-instructions, it works
    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
    // nounwind. If we want to generalize this later, we may need to emit
    // CFI pseudo-instructions.
    assert(Entry->getParent()->getFunction().hasFnAttribute(
               Attribute::NoUnwind) &&
           "Function should be nounwind in insertCopiesSplitCSR!");
    Entry->addLiveIn(*I);
    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
        .addReg(*I);

    // Insert the copy-back instructions right before the terminator.
    for (auto *Exit : Exits)
      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
              TII->get(TargetOpcode::COPY), *I)
          .addReg(NewVR);
  }
}
25478
  // Integer division on AArch64 is expensive. However, when aggressively
  // optimizing for code size, we prefer to use a div instruction, as it is
  // usually smaller than the alternative sequence.
  // The exception to this is vector division. Since AArch64 doesn't have vector
  // integer division, leaving the division as-is is a loss even in terms of
  // size, because it will have to be scalarized, while the alternative code
  // sequence can be performed in vector form.
  // Note: only the MinSize attribute is consulted here, not OptimizeForSize.
  bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
  return OptSize && !VT.isVector();
}
25490
  // We want inc-of-add for scalars and sub-of-not for vectors.
  // Scalar integer types return true (prefer the add-of-one form); all other
  // types, including vectors, return false (prefer sub-of-not).
  return VT.isScalarInteger();
}
25495
                                                 EVT VT) const {
  // v8f16 without fp16 need to be extended to v8f32, which is more difficult to
  // legalize, so refuse the saturating-conversion rewrite in that case.
  if (FPVT == MVT::v8f16 && !Subtarget->hasFullFP16())
    return false;
  // Otherwise defer to the generic profitability check.
  return TargetLowering::shouldConvertFpToSat(Op, FPVT, VT);
}
25504
25508 const TargetInstrInfo *TII) const {
25509 assert(MBBI->isCall() && MBBI->getCFIType() &&
25510 "Invalid call instruction for a KCFI check");
25511
25512 switch (MBBI->getOpcode()) {
25513 case AArch64::BLR:
25514 case AArch64::BLRNoIP:
25515 case AArch64::TCRETURNri:
25516 case AArch64::TCRETURNriBTI:
25517 break;
25518 default:
25519 llvm_unreachable("Unexpected CFI call opcode");
25520 }
25521
25522 MachineOperand &Target = MBBI->getOperand(0);
25523 assert(Target.isReg() && "Invalid target operand for an indirect call");
25524 Target.setIsRenamable(false);
25525
25526 return BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(AArch64::KCFI_CHECK))
25527 .addReg(Target.getReg())
25528 .addImm(MBBI->getCFIType())
25529 .getInstr();
25530}
25531
  // Allow the combiner to form FMAs more aggressively, but only when the
  // subtarget opted in and the type is floating point.
  return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint();
}
25535
25536unsigned
25538 if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
25539 return getPointerTy(DL).getSizeInBits();
25540
25541 return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
25542}
25543
25544void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
25545 MachineFrameInfo &MFI = MF.getFrameInfo();
25546 // If we have any vulnerable SVE stack objects then the stack protector
25547 // needs to be placed at the top of the SVE stack area, as the SVE locals
25548 // are placed above the other locals, so we allocate it as if it were a
25549 // scalable vector.
25550 // FIXME: It may be worthwhile having a specific interface for this rather
25551 // than doing it here in finalizeLowering.
25552 if (MFI.hasStackProtectorIndex()) {
25553 for (unsigned int i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) {
25559 break;
25560 }
25561 }
25562 }
25565}
25566
25567// Unlike X86, we let frame lowering assign offsets to all catch objects.
25569 return false;
25570}
25571
25572bool AArch64TargetLowering::shouldLocalize(
25573 const MachineInstr &MI, const TargetTransformInfo *TTI) const {
25574 auto &MF = *MI.getMF();
25575 auto &MRI = MF.getRegInfo();
25576 auto maxUses = [](unsigned RematCost) {
25577 // A cost of 1 means remats are basically free.
25578 if (RematCost == 1)
25579 return std::numeric_limits<unsigned>::max();
25580 if (RematCost == 2)
25581 return 2U;
25582
25583 // Remat is too expensive, only sink if there's one user.
25584 if (RematCost > 2)
25585 return 1U;
25586 llvm_unreachable("Unexpected remat cost");
25587 };
25588
25589 unsigned Opc = MI.getOpcode();
25590 switch (Opc) {
25591 case TargetOpcode::G_GLOBAL_VALUE: {
25592 // On Darwin, TLS global vars get selected into function calls, which
25593 // we don't want localized, as they can get moved into the middle of a
25594 // another call sequence.
25595 const GlobalValue &GV = *MI.getOperand(1).getGlobal();
25596 if (GV.isThreadLocal() && Subtarget->isTargetMachO())
25597 return false;
25598 return true; // Always localize G_GLOBAL_VALUE to avoid high reg pressure.
25599 }
25600 case TargetOpcode::G_FCONSTANT:
25601 case TargetOpcode::G_CONSTANT: {
25602 const ConstantInt *CI;
25603 unsigned AdditionalCost = 0;
25604
25605 if (Opc == TargetOpcode::G_CONSTANT)
25606 CI = MI.getOperand(1).getCImm();
25607 else {
25608 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
25609 // We try to estimate cost of 32/64b fpimms, as they'll likely be
25610 // materialized as integers.
25611 if (Ty.getScalarSizeInBits() != 32 && Ty.getScalarSizeInBits() != 64)
25612 break;
25613 auto APF = MI.getOperand(1).getFPImm()->getValueAPF();
25614 bool OptForSize =
25617 OptForSize))
25618 return true; // Constant should be cheap.
25619 CI =
25620 ConstantInt::get(MF.getFunction().getContext(), APF.bitcastToAPInt());
25621 // FP materialization also costs an extra move, from gpr to fpr.
25622 AdditionalCost = 1;
25623 }
25624 APInt Imm = CI->getValue();
25627 assert(Cost.isValid() && "Expected a valid imm cost");
25628
25629 unsigned RematCost = *Cost.getValue();
25630 RematCost += AdditionalCost;
25631 Register Reg = MI.getOperand(0).getReg();
25632 unsigned MaxUses = maxUses(RematCost);
25633 // Don't pass UINT_MAX sentinel value to hasAtMostUserInstrs().
25634 if (MaxUses == std::numeric_limits<unsigned>::max())
25635 --MaxUses;
25636 return MRI.hasAtMostUserInstrs(Reg, MaxUses);
25637 }
25638 // If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being
25639 // localizable.
25640 case AArch64::ADRP:
25641 case AArch64::G_ADD_LOW:
25642 // Need to localize G_PTR_ADD so that G_GLOBAL_VALUE can be localized too.
25643 case TargetOpcode::G_PTR_ADD:
25644 return true;
25645 default:
25646 break;
25647 }
25649}
25650
25652 if (Inst.getType()->isScalableTy())
25653 return true;
25654
25655 for (unsigned i = 0; i < Inst.getNumOperands(); ++i)
25656 if (Inst.getOperand(i)->getType()->isScalableTy())
25657 return true;
25658
25659 if (const AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) {
25660 if (AI->getAllocatedType()->isScalableTy())
25661 return true;
25662 }
25663
25664 // Checks to allow the use of SME instructions
25665 if (auto *Base = dyn_cast<CallBase>(&Inst)) {
25666 auto CallerAttrs = SMEAttrs(*Inst.getFunction());
25667 auto CalleeAttrs = SMEAttrs(*Base);
25668 if (CallerAttrs.requiresSMChange(CalleeAttrs) ||
25669 CallerAttrs.requiresLazySave(CalleeAttrs))
25670 return true;
25671 }
25672 return false;
25673}
25674
25675// Return the largest legal scalable vector type that matches VT's element type.
25679 "Expected legal fixed length vector!");
25680 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
25681 default:
25682 llvm_unreachable("unexpected element type for SVE container");
25683 case MVT::i8:
25684 return EVT(MVT::nxv16i8);
25685 case MVT::i16:
25686 return EVT(MVT::nxv8i16);
25687 case MVT::i32:
25688 return EVT(MVT::nxv4i32);
25689 case MVT::i64:
25690 return EVT(MVT::nxv2i64);
25691 case MVT::f16:
25692 return EVT(MVT::nxv8f16);
25693 case MVT::f32:
25694 return EVT(MVT::nxv4f32);
25695 case MVT::f64:
25696 return EVT(MVT::nxv2f64);
25697 }
25698}
25699
25700// Return a PTRUE with active lanes corresponding to the extent of VT.
25702 EVT VT) {
25705 "Expected legal fixed length vector!");
25706
25707 std::optional<unsigned> PgPattern =
25709 assert(PgPattern && "Unexpected element count for SVE predicate");
25710
25711 // For vectors that are exactly getMaxSVEVectorSizeInBits big, we can use
25712 // AArch64SVEPredPattern::all, which can enable the use of unpredicated
25713 // variants of instructions when available.
25714 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
25715 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
25716 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
25717 if (MaxSVESize && MinSVESize == MaxSVESize &&
25718 MaxSVESize == VT.getSizeInBits())
25719 PgPattern = AArch64SVEPredPattern::all;
25720
25721 MVT MaskVT;
25722 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
25723 default:
25724 llvm_unreachable("unexpected element type for SVE predicate");
25725 case MVT::i8:
25726 MaskVT = MVT::nxv16i1;
25727 break;
25728 case MVT::i16:
25729 case MVT::f16:
25730 MaskVT = MVT::nxv8i1;
25731 break;
25732 case MVT::i32:
25733 case MVT::f32:
25734 MaskVT = MVT::nxv4i1;
25735 break;
25736 case MVT::i64:
25737 case MVT::f64:
25738 MaskVT = MVT::nxv2i1;
25739 break;
25740 }
25741
25742 return getPTrue(DAG, DL, MaskVT, *PgPattern);
25743}
25744
25746 EVT VT) {
25748 "Expected legal scalable vector!");
25749 auto PredTy = VT.changeVectorElementType(MVT::i1);
25750 return getPTrue(DAG, DL, PredTy, AArch64SVEPredPattern::all);
25751}
25752
25754 if (VT.isFixedLengthVector())
25755 return getPredicateForFixedLengthVector(DAG, DL, VT);
25756
25757 return getPredicateForScalableVector(DAG, DL, VT);
25758}
25759
25760// Grow V to consume an entire SVE register.
25762 assert(VT.isScalableVector() &&
25763 "Expected to convert into a scalable vector!");
25764 assert(V.getValueType().isFixedLengthVector() &&
25765 "Expected a fixed length vector operand!");
25766 SDLoc DL(V);
25767 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
25768 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero);
25769}
25770
25771// Shrink V so it's just big enough to maintain a VT's worth of data.
25774 "Expected to convert into a fixed length vector!");
25775 assert(V.getValueType().isScalableVector() &&
25776 "Expected a scalable vector operand!");
25777 SDLoc DL(V);
25778 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
25779 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero);
25780}
25781
25782// Convert all fixed length vector loads larger than NEON to masked_loads.
25783SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
25784 SDValue Op, SelectionDAG &DAG) const {
25785 auto Load = cast<LoadSDNode>(Op);
25786
25787 SDLoc DL(Op);
25788 EVT VT = Op.getValueType();
25789 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
25790 EVT LoadVT = ContainerVT;
25791 EVT MemVT = Load->getMemoryVT();
25792
25793 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
25794
25795 if (VT.isFloatingPoint()) {
25796 LoadVT = ContainerVT.changeTypeToInteger();
25797 MemVT = MemVT.changeTypeToInteger();
25798 }
25799
25800 SDValue NewLoad = DAG.getMaskedLoad(
25801 LoadVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(), Pg,
25802 DAG.getUNDEF(LoadVT), MemVT, Load->getMemOperand(),
25803 Load->getAddressingMode(), Load->getExtensionType());
25804
25805 SDValue Result = NewLoad;
25806 if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) {
25807 EVT ExtendVT = ContainerVT.changeVectorElementType(
25808 Load->getMemoryVT().getVectorElementType());
25809
25810 Result = getSVESafeBitCast(ExtendVT, Result, DAG);
25812 Pg, Result, DAG.getUNDEF(ContainerVT));
25813 } else if (VT.isFloatingPoint()) {
25814 Result = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Result);
25815 }
25816
25817 Result = convertFromScalableVector(DAG, VT, Result);
25818 SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
25819 return DAG.getMergeValues(MergedValues, DL);
25820}
25821
25823 SelectionDAG &DAG) {
25824 SDLoc DL(Mask);
25825 EVT InVT = Mask.getValueType();
25826 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
25827
25828 auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
25829
25830 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
25831 return Pg;
25832
25833 auto Op1 = convertToScalableVector(DAG, ContainerVT, Mask);
25834 auto Op2 = DAG.getConstant(0, DL, ContainerVT);
25835
25837 {Pg, Op1, Op2, DAG.getCondCode(ISD::SETNE)});
25838}
25839
25840// Convert all fixed length vector loads larger than NEON to masked_loads.
25841SDValue AArch64TargetLowering::LowerFixedLengthVectorMLoadToSVE(
25842 SDValue Op, SelectionDAG &DAG) const {
25843 auto Load = cast<MaskedLoadSDNode>(Op);
25844
25845 SDLoc DL(Op);
25846 EVT VT = Op.getValueType();
25847 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
25848
25849 SDValue Mask = Load->getMask();
25850 // If this is an extending load and the mask type is not the same as
25851 // load's type then we have to extend the mask type.
25852 if (VT.getScalarSizeInBits() > Mask.getValueType().getScalarSizeInBits()) {
25853 assert(Load->getExtensionType() != ISD::NON_EXTLOAD &&
25854 "Incorrect mask type");
25855 Mask = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Mask);
25856 }
25858
25859 SDValue PassThru;
25860 bool IsPassThruZeroOrUndef = false;
25861
25862 if (Load->getPassThru()->isUndef()) {
25863 PassThru = DAG.getUNDEF(ContainerVT);
25864 IsPassThruZeroOrUndef = true;
25865 } else {
25866 if (ContainerVT.isInteger())
25867 PassThru = DAG.getConstant(0, DL, ContainerVT);
25868 else
25869 PassThru = DAG.getConstantFP(0, DL, ContainerVT);
25870 if (isZerosVector(Load->getPassThru().getNode()))
25871 IsPassThruZeroOrUndef = true;
25872 }
25873
25874 SDValue NewLoad = DAG.getMaskedLoad(
25875 ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),
25876 Mask, PassThru, Load->getMemoryVT(), Load->getMemOperand(),
25877 Load->getAddressingMode(), Load->getExtensionType());
25878
25879 SDValue Result = NewLoad;
25880 if (!IsPassThruZeroOrUndef) {
25881 SDValue OldPassThru =
25882 convertToScalableVector(DAG, ContainerVT, Load->getPassThru());
25883 Result = DAG.getSelect(DL, ContainerVT, Mask, Result, OldPassThru);
25884 }
25885
25886 Result = convertFromScalableVector(DAG, VT, Result);
25887 SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
25888 return DAG.getMergeValues(MergedValues, DL);
25889}
25890
25891// Convert all fixed length vector stores larger than NEON to masked_stores.
25892SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
25893 SDValue Op, SelectionDAG &DAG) const {
25894 auto Store = cast<StoreSDNode>(Op);
25895
25896 SDLoc DL(Op);
25897 EVT VT = Store->getValue().getValueType();
25898 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
25899 EVT MemVT = Store->getMemoryVT();
25900
25901 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
25902 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
25903
25904 if (VT.isFloatingPoint() && Store->isTruncatingStore()) {
25905 EVT TruncVT = ContainerVT.changeVectorElementType(
25906 Store->getMemoryVT().getVectorElementType());
25907 MemVT = MemVT.changeTypeToInteger();
25908 NewValue = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, TruncVT, Pg,
25909 NewValue, DAG.getTargetConstant(0, DL, MVT::i64),
25910 DAG.getUNDEF(TruncVT));
25911 NewValue =
25912 getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
25913 } else if (VT.isFloatingPoint()) {
25914 MemVT = MemVT.changeTypeToInteger();
25915 NewValue =
25916 getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
25917 }
25918
25919 return DAG.getMaskedStore(Store->getChain(), DL, NewValue,
25920 Store->getBasePtr(), Store->getOffset(), Pg, MemVT,
25921 Store->getMemOperand(), Store->getAddressingMode(),
25922 Store->isTruncatingStore());
25923}
25924
25925SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE(
25926 SDValue Op, SelectionDAG &DAG) const {
25927 auto *Store = cast<MaskedStoreSDNode>(Op);
25928
25929 SDLoc DL(Op);
25930 EVT VT = Store->getValue().getValueType();
25931 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
25932
25933 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
25935
25936 return DAG.getMaskedStore(
25937 Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
25938 Mask, Store->getMemoryVT(), Store->getMemOperand(),
25939 Store->getAddressingMode(), Store->isTruncatingStore());
25940}
25941
25942SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE(
25943 SDValue Op, SelectionDAG &DAG) const {
25944 SDLoc dl(Op);
25945 EVT VT = Op.getValueType();
25946 EVT EltVT = VT.getVectorElementType();
25947
25948 bool Signed = Op.getOpcode() == ISD::SDIV;
25949 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
25950
25951 bool Negated;
25952 uint64_t SplatVal;
25953 if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
25954 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
25955 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
25956 SDValue Op2 = DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32);
25957
25958 SDValue Pg = getPredicateForFixedLengthVector(DAG, dl, VT);
25959 SDValue Res =
25960 DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, dl, ContainerVT, Pg, Op1, Op2);
25961 if (Negated)
25962 Res = DAG.getNode(ISD::SUB, dl, ContainerVT,
25963 DAG.getConstant(0, dl, ContainerVT), Res);
25964
25965 return convertFromScalableVector(DAG, VT, Res);
25966 }
25967
25968 // Scalable vector i32/i64 DIV is supported.
25969 if (EltVT == MVT::i32 || EltVT == MVT::i64)
25970 return LowerToPredicatedOp(Op, DAG, PredOpcode);
25971
25972 // Scalable vector i8/i16 DIV is not supported. Promote it to i32.
25973 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
25974 EVT PromVT = HalfVT.widenIntegerVectorElementType(*DAG.getContext());
25975 unsigned ExtendOpcode = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
25976
25977 // If the wider type is legal: extend, op, and truncate.
25978 EVT WideVT = VT.widenIntegerVectorElementType(*DAG.getContext());
25979 if (DAG.getTargetLoweringInfo().isTypeLegal(WideVT)) {
25980 SDValue Op0 = DAG.getNode(ExtendOpcode, dl, WideVT, Op.getOperand(0));
25981 SDValue Op1 = DAG.getNode(ExtendOpcode, dl, WideVT, Op.getOperand(1));
25982 SDValue Div = DAG.getNode(Op.getOpcode(), dl, WideVT, Op0, Op1);
25983 return DAG.getNode(ISD::TRUNCATE, dl, VT, Div);
25984 }
25985
25986 auto HalveAndExtendVector = [&DAG, &dl, &HalfVT, &PromVT,
25987 &ExtendOpcode](SDValue Op) {
25988 SDValue IdxZero = DAG.getConstant(0, dl, MVT::i64);
25989 SDValue IdxHalf =
25990 DAG.getConstant(HalfVT.getVectorNumElements(), dl, MVT::i64);
25991 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, Op, IdxZero);
25992 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, Op, IdxHalf);
25993 return std::pair<SDValue, SDValue>(
25994 {DAG.getNode(ExtendOpcode, dl, PromVT, Lo),
25995 DAG.getNode(ExtendOpcode, dl, PromVT, Hi)});
25996 };
25997
25998 // If wider type is not legal: split, extend, op, trunc and concat.
25999 auto [Op0LoExt, Op0HiExt] = HalveAndExtendVector(Op.getOperand(0));
26000 auto [Op1LoExt, Op1HiExt] = HalveAndExtendVector(Op.getOperand(1));
26001 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, PromVT, Op0LoExt, Op1LoExt);
26002 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, PromVT, Op0HiExt, Op1HiExt);
26003 SDValue LoTrunc = DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Lo);
26004 SDValue HiTrunc = DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Hi);
26005 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, {LoTrunc, HiTrunc});
26006}
26007
26008SDValue AArch64TargetLowering::LowerFixedLengthVectorIntExtendToSVE(
26009 SDValue Op, SelectionDAG &DAG) const {
26010 EVT VT = Op.getValueType();
26011 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
26012
26013 SDLoc DL(Op);
26014 SDValue Val = Op.getOperand(0);
26015 EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
26016 Val = convertToScalableVector(DAG, ContainerVT, Val);
26017
26018 bool Signed = Op.getOpcode() == ISD::SIGN_EXTEND;
26019 unsigned ExtendOpc = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
26020
26021 // Repeatedly unpack Val until the result is of the desired element type.
26022 switch (ContainerVT.getSimpleVT().SimpleTy) {
26023 default:
26024 llvm_unreachable("unimplemented container type");
26025 case MVT::nxv16i8:
26026 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv8i16, Val);
26027 if (VT.getVectorElementType() == MVT::i16)
26028 break;
26029 [[fallthrough]];
26030 case MVT::nxv8i16:
26031 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv4i32, Val);
26032 if (VT.getVectorElementType() == MVT::i32)
26033 break;
26034 [[fallthrough]];
26035 case MVT::nxv4i32:
26036 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv2i64, Val);
26037 assert(VT.getVectorElementType() == MVT::i64 && "Unexpected element type!");
26038 break;
26039 }
26040
26041 return convertFromScalableVector(DAG, VT, Val);
26042}
26043
26044SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE(
26045 SDValue Op, SelectionDAG &DAG) const {
26046 EVT VT = Op.getValueType();
26047 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
26048
26049 SDLoc DL(Op);
26050 SDValue Val = Op.getOperand(0);
26051 EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
26052 Val = convertToScalableVector(DAG, ContainerVT, Val);
26053
26054 // Repeatedly truncate Val until the result is of the desired element type.
26055 switch (ContainerVT.getSimpleVT().SimpleTy) {
26056 default:
26057 llvm_unreachable("unimplemented container type");
26058 case MVT::nxv2i64:
26059 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv4i32, Val);
26060 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv4i32, Val, Val);
26061 if (VT.getVectorElementType() == MVT::i32)
26062 break;
26063 [[fallthrough]];
26064 case MVT::nxv4i32:
26065 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv8i16, Val);
26066 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv8i16, Val, Val);
26067 if (VT.getVectorElementType() == MVT::i16)
26068 break;
26069 [[fallthrough]];
26070 case MVT::nxv8i16:
26071 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i8, Val);
26072 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv16i8, Val, Val);
26073 assert(VT.getVectorElementType() == MVT::i8 && "Unexpected element type!");
26074 break;
26075 }
26076
26077 return convertFromScalableVector(DAG, VT, Val);
26078}
26079
26080SDValue AArch64TargetLowering::LowerFixedLengthExtractVectorElt(
26081 SDValue Op, SelectionDAG &DAG) const {
26082 EVT VT = Op.getValueType();
26083 EVT InVT = Op.getOperand(0).getValueType();
26084 assert(InVT.isFixedLengthVector() && "Expected fixed length vector type!");
26085
26086 SDLoc DL(Op);
26087 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
26088 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
26089
26090 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Op.getOperand(1));
26091}
26092
26093SDValue AArch64TargetLowering::LowerFixedLengthInsertVectorElt(
26094 SDValue Op, SelectionDAG &DAG) const {
26095 EVT VT = Op.getValueType();
26096 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
26097
26098 SDLoc DL(Op);
26099 EVT InVT = Op.getOperand(0).getValueType();
26100 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
26101 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
26102
26103 auto ScalableRes = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT, Op0,
26104 Op.getOperand(1), Op.getOperand(2));
26105
26106 return convertFromScalableVector(DAG, VT, ScalableRes);
26107}
26108
26109// Convert vector operation 'Op' to an equivalent predicated operation whereby
26110// the original operation's type is used to construct a suitable predicate.
26111// NOTE: The results for inactive lanes are undefined.
26112SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
26113 SelectionDAG &DAG,
26114 unsigned NewOp) const {
26115 EVT VT = Op.getValueType();
26116 SDLoc DL(Op);
26117 auto Pg = getPredicateForVector(DAG, DL, VT);
26118
26119 if (VT.isFixedLengthVector()) {
26120 assert(isTypeLegal(VT) && "Expected only legal fixed-width types");
26121 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26122
26123 // Create list of operands by converting existing ones to scalable types.
26125 for (const SDValue &V : Op->op_values()) {
26126 if (isa<CondCodeSDNode>(V)) {
26127 Operands.push_back(V);
26128 continue;
26129 }
26130
26131 if (const VTSDNode *VTNode = dyn_cast<VTSDNode>(V)) {
26132 EVT VTArg = VTNode->getVT().getVectorElementType();
26133 EVT NewVTArg = ContainerVT.changeVectorElementType(VTArg);
26134 Operands.push_back(DAG.getValueType(NewVTArg));
26135 continue;
26136 }
26137
26138 assert(isTypeLegal(V.getValueType()) &&
26139 "Expected only legal fixed-width types");
26140 Operands.push_back(convertToScalableVector(DAG, ContainerVT, V));
26141 }
26142
26143 if (isMergePassthruOpcode(NewOp))
26144 Operands.push_back(DAG.getUNDEF(ContainerVT));
26145
26146 auto ScalableRes = DAG.getNode(NewOp, DL, ContainerVT, Operands);
26147 return convertFromScalableVector(DAG, VT, ScalableRes);
26148 }
26149
26150 assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
26151
26153 for (const SDValue &V : Op->op_values()) {
26154 assert((!V.getValueType().isVector() ||
26155 V.getValueType().isScalableVector()) &&
26156 "Only scalable vectors are supported!");
26157 Operands.push_back(V);
26158 }
26159
26160 if (isMergePassthruOpcode(NewOp))
26161 Operands.push_back(DAG.getUNDEF(VT));
26162
26163 return DAG.getNode(NewOp, DL, VT, Operands, Op->getFlags());
26164}
26165
26166// If a fixed length vector operation has no side effects when applied to
26167// undefined elements, we can safely use scalable vectors to perform the same
26168// operation without needing to worry about predication.
26169SDValue AArch64TargetLowering::LowerToScalableOp(SDValue Op,
26170 SelectionDAG &DAG) const {
26171 EVT VT = Op.getValueType();
26173 "Only expected to lower fixed length vector operation!");
26174 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26175
26176 // Create list of operands by converting existing ones to scalable types.
26178 for (const SDValue &V : Op->op_values()) {
26179 assert(!isa<VTSDNode>(V) && "Unexpected VTSDNode node!");
26180
26181 // Pass through non-vector operands.
26182 if (!V.getValueType().isVector()) {
26183 Ops.push_back(V);
26184 continue;
26185 }
26186
26187 // "cast" fixed length vector to a scalable vector.
26188 assert(V.getValueType().isFixedLengthVector() &&
26189 isTypeLegal(V.getValueType()) &&
26190 "Only fixed length vectors are supported!");
26191 Ops.push_back(convertToScalableVector(DAG, ContainerVT, V));
26192 }
26193
26194 auto ScalableRes = DAG.getNode(Op.getOpcode(), SDLoc(Op), ContainerVT, Ops);
26195 return convertFromScalableVector(DAG, VT, ScalableRes);
26196}
26197
26198SDValue AArch64TargetLowering::LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp,
26199 SelectionDAG &DAG) const {
26200 SDLoc DL(ScalarOp);
26201 SDValue AccOp = ScalarOp.getOperand(0);
26202 SDValue VecOp = ScalarOp.getOperand(1);
26203 EVT SrcVT = VecOp.getValueType();
26204 EVT ResVT = SrcVT.getVectorElementType();
26205
26206 EVT ContainerVT = SrcVT;
26207 if (SrcVT.isFixedLengthVector()) {
26208 ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
26209 VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
26210 }
26211
26212 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
26213 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
26214
26215 // Convert operands to Scalable.
26216 AccOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT,
26217 DAG.getUNDEF(ContainerVT), AccOp, Zero);
26218
26219 // Perform reduction.
26220 SDValue Rdx = DAG.getNode(AArch64ISD::FADDA_PRED, DL, ContainerVT,
26221 Pg, AccOp, VecOp);
26222
26223 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Rdx, Zero);
26224}
26225
26226SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp,
26227 SelectionDAG &DAG) const {
26228 SDLoc DL(ReduceOp);
26229 SDValue Op = ReduceOp.getOperand(0);
26230 EVT OpVT = Op.getValueType();
26231 EVT VT = ReduceOp.getValueType();
26232
26233 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
26234 return SDValue();
26235
26236 SDValue Pg = getPredicateForVector(DAG, DL, OpVT);
26237
26238 switch (ReduceOp.getOpcode()) {
26239 default:
26240 return SDValue();
26241 case ISD::VECREDUCE_OR:
26242 if (isAllActivePredicate(DAG, Pg) && OpVT == MVT::nxv16i1)
26243 // The predicate can be 'Op' because
26244 // vecreduce_or(Op & <all true>) <=> vecreduce_or(Op).
26245 return getPTest(DAG, VT, Op, Op, AArch64CC::ANY_ACTIVE);
26246 else
26247 return getPTest(DAG, VT, Pg, Op, AArch64CC::ANY_ACTIVE);
26248 case ISD::VECREDUCE_AND: {
26249 Op = DAG.getNode(ISD::XOR, DL, OpVT, Op, Pg);
26250 return getPTest(DAG, VT, Pg, Op, AArch64CC::NONE_ACTIVE);
26251 }
26252 case ISD::VECREDUCE_XOR: {
26253 SDValue ID =
26254 DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64);
26255 if (OpVT == MVT::nxv1i1) {
26256 // Emulate a CNTP on .Q using .D and a different governing predicate.
26257 Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Pg);
26258 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Op);
26259 }
26260 SDValue Cntp =
26261 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, ID, Pg, Op);
26262 return DAG.getAnyExtOrTrunc(Cntp, DL, VT);
26263 }
26264 }
26265
26266 return SDValue();
26267}
26268
26269SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode,
26270 SDValue ScalarOp,
26271 SelectionDAG &DAG) const {
26272 SDLoc DL(ScalarOp);
26273 SDValue VecOp = ScalarOp.getOperand(0);
26274 EVT SrcVT = VecOp.getValueType();
26275
26277 SrcVT,
26278 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
26279 EVT ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
26280 VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
26281 }
26282
26283 // UADDV always returns an i64 result.
26284 EVT ResVT = (Opcode == AArch64ISD::UADDV_PRED) ? MVT::i64 :
26285 SrcVT.getVectorElementType();
26286 EVT RdxVT = SrcVT;
26287 if (SrcVT.isFixedLengthVector() || Opcode == AArch64ISD::UADDV_PRED)
26288 RdxVT = getPackedSVEVectorVT(ResVT);
26289
26290 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
26291 SDValue Rdx = DAG.getNode(Opcode, DL, RdxVT, Pg, VecOp);
26292 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT,
26293 Rdx, DAG.getConstant(0, DL, MVT::i64));
26294
26295 // The VEC_REDUCE nodes expect an element size result.
26296 if (ResVT != ScalarOp.getValueType())
26297 Res = DAG.getAnyExtOrTrunc(Res, DL, ScalarOp.getValueType());
26298
26299 return Res;
26300}
26301
26302SDValue
26303AArch64TargetLowering::LowerFixedLengthVectorSelectToSVE(SDValue Op,
26304 SelectionDAG &DAG) const {
26305 EVT VT = Op.getValueType();
26306 SDLoc DL(Op);
26307
26308 EVT InVT = Op.getOperand(1).getValueType();
26309 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
26310 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(1));
26311 SDValue Op2 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(2));
26312
26313 // Convert the mask to a predicated (NOTE: We don't need to worry about
26314 // inactive lanes since VSELECT is safe when given undefined elements).
26315 EVT MaskVT = Op.getOperand(0).getValueType();
26316 EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskVT);
26317 auto Mask = convertToScalableVector(DAG, MaskContainerVT, Op.getOperand(0));
26319 MaskContainerVT.changeVectorElementType(MVT::i1), Mask);
26320
26321 auto ScalableRes = DAG.getNode(ISD::VSELECT, DL, ContainerVT,
26322 Mask, Op1, Op2);
26323
26324 return convertFromScalableVector(DAG, VT, ScalableRes);
26325}
26326
26327SDValue AArch64TargetLowering::LowerFixedLengthVectorSetccToSVE(
26328 SDValue Op, SelectionDAG &DAG) const {
26329 SDLoc DL(Op);
26330 EVT InVT = Op.getOperand(0).getValueType();
26331 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
26332
26333 assert(InVT.isFixedLengthVector() && isTypeLegal(InVT) &&
26334 "Only expected to lower fixed length vector operation!");
26335 assert(Op.getValueType() == InVT.changeTypeToInteger() &&
26336 "Expected integer result of the same bit length as the inputs!");
26337
26338 auto Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
26339 auto Op2 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));
26340 auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
26341
26342 EVT CmpVT = Pg.getValueType();
26343 auto Cmp = DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, CmpVT,
26344 {Pg, Op1, Op2, Op.getOperand(2)});
26345
26346 EVT PromoteVT = ContainerVT.changeTypeToInteger();
26347 auto Promote = DAG.getBoolExtOrTrunc(Cmp, DL, PromoteVT, InVT);
26348 return convertFromScalableVector(DAG, Op.getValueType(), Promote);
26349}
26350
26351SDValue
26352AArch64TargetLowering::LowerFixedLengthBitcastToSVE(SDValue Op,
26353 SelectionDAG &DAG) const {
26354 SDLoc DL(Op);
26355 auto SrcOp = Op.getOperand(0);
26356 EVT VT = Op.getValueType();
26357 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
26358 EVT ContainerSrcVT =
26359 getContainerForFixedLengthVector(DAG, SrcOp.getValueType());
26360
26361 SrcOp = convertToScalableVector(DAG, ContainerSrcVT, SrcOp);
26362 Op = DAG.getNode(ISD::BITCAST, DL, ContainerDstVT, SrcOp);
26363 return convertFromScalableVector(DAG, VT, Op);
26364}
26365
26366SDValue AArch64TargetLowering::LowerFixedLengthConcatVectorsToSVE(
26367 SDValue Op, SelectionDAG &DAG) const {
26368 SDLoc DL(Op);
26369 unsigned NumOperands = Op->getNumOperands();
26370
26371 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
26372 "Unexpected number of operands in CONCAT_VECTORS");
26373
26374 auto SrcOp1 = Op.getOperand(0);
26375 auto SrcOp2 = Op.getOperand(1);
26376 EVT VT = Op.getValueType();
26377 EVT SrcVT = SrcOp1.getValueType();
26378
26379 if (NumOperands > 2) {
26381 EVT PairVT = SrcVT.getDoubleNumVectorElementsVT(*DAG.getContext());
26382 for (unsigned I = 0; I < NumOperands; I += 2)
26383 Ops.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, PairVT,
26384 Op->getOperand(I), Op->getOperand(I + 1)));
26385
26386 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops);
26387 }
26388
26389 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26390
26392 SrcOp1 = convertToScalableVector(DAG, ContainerVT, SrcOp1);
26393 SrcOp2 = convertToScalableVector(DAG, ContainerVT, SrcOp2);
26394
26395 Op = DAG.getNode(AArch64ISD::SPLICE, DL, ContainerVT, Pg, SrcOp1, SrcOp2);
26396
26397 return convertFromScalableVector(DAG, VT, Op);
26398}
26399
26400SDValue
26401AArch64TargetLowering::LowerFixedLengthFPExtendToSVE(SDValue Op,
26402 SelectionDAG &DAG) const {
26403 EVT VT = Op.getValueType();
26404 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
26405
26406 SDLoc DL(Op);
26407 SDValue Val = Op.getOperand(0);
26408 SDValue Pg = getPredicateForVector(DAG, DL, VT);
26409 EVT SrcVT = Val.getValueType();
26410 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26411 EVT ExtendVT = ContainerVT.changeVectorElementType(
26412 SrcVT.getVectorElementType());
26413
26414 Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
26415 Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT.changeTypeToInteger(), Val);
26416
26417 Val = convertToScalableVector(DAG, ContainerVT.changeTypeToInteger(), Val);
26418 Val = getSVESafeBitCast(ExtendVT, Val, DAG);
26419 Val = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
26420 Pg, Val, DAG.getUNDEF(ContainerVT));
26421
26422 return convertFromScalableVector(DAG, VT, Val);
26423}
26424
26425SDValue
26426AArch64TargetLowering::LowerFixedLengthFPRoundToSVE(SDValue Op,
26427 SelectionDAG &DAG) const {
26428 EVT VT = Op.getValueType();
26429 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
26430
26431 SDLoc DL(Op);
26432 SDValue Val = Op.getOperand(0);
26433 EVT SrcVT = Val.getValueType();
26434 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
26435 EVT RoundVT = ContainerSrcVT.changeVectorElementType(
26437 SDValue Pg = getPredicateForVector(DAG, DL, RoundVT);
26438
26439 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
26440 Val = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, RoundVT, Pg, Val,
26441 Op.getOperand(1), DAG.getUNDEF(RoundVT));
26442 Val = getSVESafeBitCast(ContainerSrcVT.changeTypeToInteger(), Val, DAG);
26443 Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
26444
26445 Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
26446 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
26447}
26448
26449SDValue
26450AArch64TargetLowering::LowerFixedLengthIntToFPToSVE(SDValue Op,
26451 SelectionDAG &DAG) const {
26452 EVT VT = Op.getValueType();
26453 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
26454
26455 bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP;
26456 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
26458
26459 SDLoc DL(Op);
26460 SDValue Val = Op.getOperand(0);
26461 EVT SrcVT = Val.getValueType();
26462 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
26463 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
26464
26465 if (VT.bitsGE(SrcVT)) {
26467
26468 Val = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
26469 VT.changeTypeToInteger(), Val);
26470
26471 // Safe to use a larger than specified operand because by promoting the
26472 // value nothing has changed from an arithmetic point of view.
26473 Val =
26474 convertToScalableVector(DAG, ContainerDstVT.changeTypeToInteger(), Val);
26475 Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
26476 DAG.getUNDEF(ContainerDstVT));
26477 return convertFromScalableVector(DAG, VT, Val);
26478 } else {
26479 EVT CvtVT = ContainerSrcVT.changeVectorElementType(
26480 ContainerDstVT.getVectorElementType());
26482
26483 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
26484 Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
26485 Val = getSVESafeBitCast(ContainerSrcVT, Val, DAG);
26486 Val = convertFromScalableVector(DAG, SrcVT, Val);
26487
26488 Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
26489 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
26490 }
26491}
26492
26493SDValue
26494AArch64TargetLowering::LowerVECTOR_DEINTERLEAVE(SDValue Op,
26495 SelectionDAG &DAG) const {
26496 SDLoc DL(Op);
26497 EVT OpVT = Op.getValueType();
26498 assert(OpVT.isScalableVector() &&
26499 "Expected scalable vector in LowerVECTOR_DEINTERLEAVE.");
26500 SDValue Even = DAG.getNode(AArch64ISD::UZP1, DL, OpVT, Op.getOperand(0),
26501 Op.getOperand(1));
26502 SDValue Odd = DAG.getNode(AArch64ISD::UZP2, DL, OpVT, Op.getOperand(0),
26503 Op.getOperand(1));
26504 return DAG.getMergeValues({Even, Odd}, DL);
26505}
26506
26507SDValue AArch64TargetLowering::LowerVECTOR_INTERLEAVE(SDValue Op,
26508 SelectionDAG &DAG) const {
26509 SDLoc DL(Op);
26510 EVT OpVT = Op.getValueType();
26511 assert(OpVT.isScalableVector() &&
26512 "Expected scalable vector in LowerVECTOR_INTERLEAVE.");
26513
26514 SDValue Lo = DAG.getNode(AArch64ISD::ZIP1, DL, OpVT, Op.getOperand(0),
26515 Op.getOperand(1));
26516 SDValue Hi = DAG.getNode(AArch64ISD::ZIP2, DL, OpVT, Op.getOperand(0),
26517 Op.getOperand(1));
26518 return DAG.getMergeValues({Lo, Hi}, DL);
26519}
26520
26521SDValue
26522AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op,
26523 SelectionDAG &DAG) const {
26524 EVT VT = Op.getValueType();
26525 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
26526
26527 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
26528 unsigned Opcode = IsSigned ? AArch64ISD::FCVTZS_MERGE_PASSTHRU
26530
26531 SDLoc DL(Op);
26532 SDValue Val = Op.getOperand(0);
26533 EVT SrcVT = Val.getValueType();
26534 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
26535 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
26536
26537 if (VT.bitsGT(SrcVT)) {
26538 EVT CvtVT = ContainerDstVT.changeVectorElementType(
26539 ContainerSrcVT.getVectorElementType());
26541
26542 Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
26543 Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Val);
26544
26545 Val = convertToScalableVector(DAG, ContainerDstVT, Val);
26546 Val = getSVESafeBitCast(CvtVT, Val, DAG);
26547 Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
26548 DAG.getUNDEF(ContainerDstVT));
26549 return convertFromScalableVector(DAG, VT, Val);
26550 } else {
26551 EVT CvtVT = ContainerSrcVT.changeTypeToInteger();
26553
26554 // Safe to use a larger than specified result since an fp_to_int where the
26555 // result doesn't fit into the destination is undefined.
26556 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
26557 Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
26558 Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
26559
26560 return DAG.getNode(ISD::TRUNCATE, DL, VT, Val);
26561 }
26562}
26563
26565 ArrayRef<int> ShuffleMask, EVT VT,
26566 EVT ContainerVT, SelectionDAG &DAG) {
26567 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
26568 SDLoc DL(Op);
26569 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
26570 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
26571 bool IsSingleOp =
26572 ShuffleVectorInst::isSingleSourceMask(ShuffleMask, ShuffleMask.size());
26573
26574 if (!Subtarget.isNeonAvailable() && !MinSVESize)
26575 MinSVESize = 128;
26576
26577 // Ignore two operands if no SVE2 or all index numbers couldn't
26578 // be represented.
26579 if (!IsSingleOp && (!Subtarget.hasSVE2() || MinSVESize != MaxSVESize))
26580 return SDValue();
26581
26582 EVT VTOp1 = Op.getOperand(0).getValueType();
26583 unsigned BitsPerElt = VTOp1.getVectorElementType().getSizeInBits();
26584 unsigned IndexLen = MinSVESize / BitsPerElt;
26585 unsigned ElementsPerVectorReg = VTOp1.getVectorNumElements();
26586 uint64_t MaxOffset = APInt(BitsPerElt, -1, false).getZExtValue();
26587 assert(ElementsPerVectorReg <= IndexLen && ShuffleMask.size() <= IndexLen &&
26588 "Incorrectly legalised shuffle operation");
26589
26591 for (int Index : ShuffleMask) {
26592 // Handling poison index value.
26593 if (Index < 0)
26594 Index = 0;
26595 // If we refer to the second operand then we have to add elements
26596 // number in hardware register minus number of elements in a type.
26597 if ((unsigned)Index >= ElementsPerVectorReg)
26598 Index += IndexLen - ElementsPerVectorReg;
26599 // For 8-bit elements and 1024-bit SVE registers and MaxOffset equals
26600 // to 255, this might point to the last element of in the second operand
26601 // of the shufflevector, thus we are rejecting this transform.
26602 if ((unsigned)Index >= MaxOffset)
26603 return SDValue();
26604 TBLMask.push_back(DAG.getConstant(Index, DL, MVT::i64));
26605 }
26606
26607 // Choosing an out-of-range index leads to the lane being zeroed vs zero
26608 // value where it would perform first lane duplication for out of
26609 // index elements. For i8 elements an out-of-range index could be a valid
26610 // for 2048-bit vector register size.
26611 for (unsigned i = 0; i < IndexLen - ElementsPerVectorReg; ++i)
26612 TBLMask.push_back(DAG.getConstant((int)MaxOffset, DL, MVT::i64));
26613
26614 EVT MaskEltType = EVT::getIntegerVT(*DAG.getContext(), BitsPerElt);
26615 EVT MaskType = EVT::getVectorVT(*DAG.getContext(), MaskEltType, IndexLen);
26616 EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskType);
26617 SDValue VecMask =
26618 DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen));
26619 SDValue SVEMask = convertToScalableVector(DAG, MaskContainerVT, VecMask);
26620
26621 SDValue Shuffle;
26622 if (IsSingleOp)
26623 Shuffle =
26624 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
26625 DAG.getConstant(Intrinsic::aarch64_sve_tbl, DL, MVT::i32),
26626 Op1, SVEMask);
26627 else if (Subtarget.hasSVE2())
26628 Shuffle =
26629 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
26630 DAG.getConstant(Intrinsic::aarch64_sve_tbl2, DL, MVT::i32),
26631 Op1, Op2, SVEMask);
26632 else
26633 llvm_unreachable("Cannot lower shuffle without SVE2 TBL");
26634 Shuffle = convertFromScalableVector(DAG, VT, Shuffle);
26635 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
26636}
26637
// Lower a fixed-length ISD::VECTOR_SHUFFLE when SVE is used for fixed-length
// vectors. Both operands are first promoted to the scalable "container" type,
// then the mask is matched against progressively more specific patterns:
// splat, last-element EXT (mapped to INSR), REV*-style masks, ZIP1/TRN masks,
// exact-register-size-only patterns, and finally a TBL-based expansion.
26638SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
26639    SDValue Op, SelectionDAG &DAG) const {
26640  EVT VT = Op.getValueType();
26641  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
26642
26643  auto *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
26644  auto ShuffleMask = SVN->getMask();
26645
26646  SDLoc DL(Op);
26647  SDValue Op1 = Op.getOperand(0);
26648  SDValue Op2 = Op.getOperand(1);
26649
26650  EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26651  Op1 = convertToScalableVector(DAG, ContainerVT, Op1);
26652  Op2 = convertToScalableVector(DAG, ContainerVT, Op2);
26653
  // i8/i16 extracts are not legal scalar types; promote the extract result
  // type to i32 so EXTRACT_VECTOR_ELT stays legal.
26654  auto MinLegalExtractEltScalarTy = [](EVT ScalarTy) -> EVT {
26655    if (ScalarTy == MVT::i8 || ScalarTy == MVT::i16)
26656      return MVT::i32;
26657    return ScalarTy;
26658  };
26659
  // A splat shuffle becomes extract-element + SPLAT_VECTOR on the container.
26660  if (SVN->isSplat()) {
26661    unsigned Lane = std::max(0, SVN->getSplatIndex());
26662    EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
26663    SDValue SplatEl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
26664                                  DAG.getConstant(Lane, DL, MVT::i64));
26665    Op = DAG.getNode(ISD::SPLAT_VECTOR, DL, ContainerVT, SplatEl);
26666    return convertFromScalableVector(DAG, VT, Op);
26667  }
26668
  // An EXT-style mask that rotates by exactly one element maps to INSR:
  // insert the last element of Op1 at the front of Op2.
26669  bool ReverseEXT = false;
26670  unsigned Imm;
26671  if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm) &&
26672      Imm == VT.getVectorNumElements() - 1) {
26673    if (ReverseEXT)
26674      std::swap(Op1, Op2);
26675    EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
26676    SDValue Scalar = DAG.getNode(
26677        ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
26678        DAG.getConstant(VT.getVectorNumElements() - 1, DL, MVT::i64));
26679    Op = DAG.getNode(AArch64ISD::INSR, DL, ContainerVT, Op2, Scalar);
26680    return convertFromScalableVector(DAG, VT, Op);
26681  }
26682
  // REV-style masks: bitcast to a wider-element type and emit the matching
  // predicated reverse operation, then bitcast back.
26683  for (unsigned LaneSize : {64U, 32U, 16U}) {
26684    if (isREVMask(ShuffleMask, VT, LaneSize)) {
26685      EVT NewVT =
26687      unsigned RevOp;
26688      unsigned EltSz = VT.getScalarSizeInBits();
26689      if (EltSz == 8)
26691      else if (EltSz == 16)
26693      else
26695
26696      Op = DAG.getNode(ISD::BITCAST, DL, NewVT, Op1);
26697      Op = LowerToPredicatedOp(Op, DAG, RevOp);
26698      Op = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Op);
26699      return convertFromScalableVector(DAG, VT, Op);
26700    }
26701  }
26702
  // A 128-bit REV of 64-bit elements can use the SVE2.1 REVD instruction.
  // Floating-point types are routed through integer bitcasts first.
26703  if (Subtarget->hasSVE2p1() && VT.getScalarSizeInBits() == 64 &&
26704      isREVMask(ShuffleMask, VT, 128)) {
26705    if (!VT.isFloatingPoint())
26706      return LowerToPredicatedOp(Op, DAG, AArch64ISD::REVD_MERGE_PASSTHRU);
26707
26709    Op = DAG.getNode(ISD::BITCAST, DL, NewVT, Op1);
26710    Op = LowerToPredicatedOp(Op, DAG, AArch64ISD::REVD_MERGE_PASSTHRU);
26711    Op = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Op);
26712    return convertFromScalableVector(DAG, VT, Op);
26713  }
26714
  // ZIP1 and TRN1/TRN2 only reference indices relative to the start of each
  // input, so they are safe regardless of the actual SVE register size.
26715  unsigned WhichResult;
26716  if (isZIPMask(ShuffleMask, VT, WhichResult) && WhichResult == 0)
26718        DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op2));
26719
26720  if (isTRNMask(ShuffleMask, VT, WhichResult)) {
26721    unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
26723        DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
26724  }
26725
26726  if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult == 0)
26728        DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op1));
26729
26730  if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
26731    unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
26733        DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
26734  }
26735
26736  // Functions like isZIPMask return true when a ISD::VECTOR_SHUFFLE's mask
26737  // represents the same logical operation as performed by a ZIP instruction. In
26738  // isolation these functions do not mean the ISD::VECTOR_SHUFFLE is exactly
26739  // equivalent to an AArch64 instruction. There's the extra component of
26740  // ISD::VECTOR_SHUFFLE's value type to consider. Prior to SVE these functions
26741  // only operated on 64/128bit vector types that have a direct mapping to a
26742  // target register and so an exact mapping is implied.
26743  // However, when using SVE for fixed length vectors, most legal vector types
26744  // are actually sub-vectors of a larger SVE register. When mapping
26745  // ISD::VECTOR_SHUFFLE to an SVE instruction care must be taken to consider
26746  // how the mask's indices translate. Specifically, when the mapping requires
26747  // an exact meaning for a specific vector index (e.g. Index X is the last
26748  // vector element in the register) then such mappings are often only safe when
26749  // the exact SVE register size is know. The main exception to this is when
26750  // indices are logically relative to the first element of either
26751  // ISD::VECTOR_SHUFFLE operand because these relative indices don't change
26752  // when converting from fixed-length to scalable vector types (i.e. the start
26753  // of a fixed length vector is always the start of a scalable vector).
26754  unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
26755  unsigned MaxSVESize = Subtarget->getMaxSVEVectorSizeInBits();
  // These patterns depend on the exact position of the last register element,
  // so they are only valid when the register size is known to equal the type
  // size (min == max == VT size).
26756  if (MinSVESize == MaxSVESize && MaxSVESize == VT.getSizeInBits()) {
26757    if (ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size()) &&
26758        Op2.isUndef()) {
26759      Op = DAG.getNode(ISD::VECTOR_REVERSE, DL, ContainerVT, Op1);
26760      return convertFromScalableVector(DAG, VT, Op);
26761    }
26762
26763    if (isZIPMask(ShuffleMask, VT, WhichResult) && WhichResult != 0)
26765          DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op2));
26766
26767    if (isUZPMask(ShuffleMask, VT, WhichResult)) {
26768      unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
26770          DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
26771    }
26772
26773    if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult != 0)
26775          DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op1));
26776
26777    if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
26778      unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
26780          DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
26781    }
26782  }
26783
26784  // Avoid producing TBL instruction if we don't know SVE register minimal size,
26785  // unless NEON is not available and we can assume minimal SVE register size is
26786  // 128-bits.
26787  if (MinSVESize || !Subtarget->isNeonAvailable())
26788    return GenerateFixedLengthSVETBL(Op, Op1, Op2, ShuffleMask, VT, ContainerVT,
26789                                     DAG);
26790
26791  return SDValue();
26792}
26793
// Bitcast between two legal scalable (non-predicate) vector types. Because an
// element-type-changing BITCAST is only meaningful between "packed" SVE types,
// the input is first reinterpreted to its packed form, bitcast, then the
// result is reinterpreted back to the requested (possibly unpacked) type.
26794SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op,
26795                                                 SelectionDAG &DAG) const {
26796  SDLoc DL(Op);
26797  EVT InVT = Op.getValueType();
26798
26799  assert(VT.isScalableVector() && isTypeLegal(VT) &&
26800         InVT.isScalableVector() && isTypeLegal(InVT) &&
26801         "Only expect to cast between legal scalable vector types!");
26802  assert(VT.getVectorElementType() != MVT::i1 &&
26803         InVT.getVectorElementType() != MVT::i1 &&
26804         "For predicate bitcasts, use getSVEPredicateBitCast");
26805
  // Nothing to do for a same-type cast.
26806  if (InVT == VT)
26807    return Op;
26808
26810  EVT PackedInVT = getPackedSVEVectorVT(InVT.getVectorElementType());
26811
26812  // Safe bitcasting between unpacked vector types of different element counts
26813  // is currently unsupported because the following is missing the necessary
26814  // work to ensure the result's elements live where they're supposed to within
26815  // an SVE register.
26816  //                01234567
26817  // e.g. nxv2i32 = XX??XX??
26818  //      nxv4f16 = X?X?X?X?
26820          VT == PackedVT || InVT == PackedInVT) &&
26821         "Unexpected bitcast!");
26822
26823  // Pack input if required.
26824  if (InVT != PackedInVT)
26825    Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, PackedInVT, Op);
26826
26827  Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op);
26828
26829  // Unpack result if required.
26830  if (VT != PackedVT)
26832
26833  return Op;
26834}
26835
26837                                                   SDValue N) const {
  // Thin wrapper: delegate to the file-local ::isAllActivePredicate helper.
26838  return ::isAllActivePredicate(DAG, N);
26839}
26840
  // Thin wrapper: delegate to the file-local ::getPromotedVTForPredicate.
26842  return ::getPromotedVTForPredicate(VT);
26843}
26844
// Target hook for demanded-bits simplification of AArch64-specific nodes.
// Handled here:
//  - (VSHL (VLSHR Val, X), X): when none of the low bits zeroed by the shift
//    pair are demanded, the pair is a no-op and Op is replaced by Val.
//  - SVE CNT* count intrinsics: the result is bounded by the maximum element
//    count for the configured vector length, so high bits are known zero.
// Everything else defers to the generic TargetLowering implementation.
26845bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
26846    SDValue Op, const APInt &OriginalDemandedBits,
26847    const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
26848    unsigned Depth) const {
26849
26850  unsigned Opc = Op.getOpcode();
26851  switch (Opc) {
26852  case AArch64ISD::VSHL: {
26853    // Match (VSHL (VLSHR Val X) X)
26854    SDValue ShiftL = Op;
26855    SDValue ShiftR = Op->getOperand(0);
26856    if (ShiftR->getOpcode() != AArch64ISD::VLSHR)
26857      return false;
26858
    // Only safe if no other node observes the intermediate shift results.
26859    if (!ShiftL.hasOneUse() || !ShiftR.hasOneUse())
26860      return false;
26861
26862    unsigned ShiftLBits = ShiftL->getConstantOperandVal(1);
26863    unsigned ShiftRBits = ShiftR->getConstantOperandVal(1);
26864
26865    // Other cases can be handled as well, but this is not
26866    // implemented.
26867    if (ShiftRBits != ShiftLBits)
26868      return false;
26869
26870    unsigned ScalarSize = Op.getScalarValueSizeInBits();
26871    assert(ScalarSize > ShiftLBits && "Invalid shift imm");
26872
    // The shift pair clears the low ShiftLBits of each lane; the fold is only
    // valid when every one of those bits is outside the demanded set.
26873    APInt ZeroBits = APInt::getLowBitsSet(ScalarSize, ShiftLBits);
26874    APInt UnusedBits = ~OriginalDemandedBits;
26875
26876    if ((ZeroBits & UnusedBits) != ZeroBits)
26877      return false;
26878
26879    // All bits that are zeroed by (VSHL (VLSHR Val X) X) are not
26880    // used - simplify to just Val.
26881    return TLO.CombineTo(Op, ShiftR->getOperand(0));
26882  }
26884    if (auto ElementSize = IsSVECntIntrinsic(Op)) {
26885      unsigned MaxSVEVectorSizeInBits = Subtarget->getMaxSVEVectorSizeInBits();
      // An unknown (zero) maximum means the architectural SVE maximum applies.
26886      if (!MaxSVEVectorSizeInBits)
26887        MaxSVEVectorSizeInBits = AArch64::SVEMaxBitsPerVector;
26888      unsigned MaxElements = MaxSVEVectorSizeInBits / *ElementSize;
26889      // The SVE count intrinsics don't support the multiplier immediate so we
26890      // don't have to account for that here. The value returned may be slightly
26891      // over the true required bits, as this is based on the "ALL" pattern. The
26892      // other patterns are also exposed by these intrinsics, but they all
26893      // return a value that's strictly less than "ALL".
26894      unsigned RequiredBits = llvm::bit_width(MaxElements);
26895      unsigned BitWidth = Known.Zero.getBitWidth();
26896      if (RequiredBits < BitWidth)
26897        Known.Zero.setHighBits(BitWidth - RequiredBits);
26898      return false;
26899    }
26900  }
26901  }
26902
26904      Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
26905}
26906
// Treat splat-style constants (DUP/MOVI, or an EXTRACT_SUBVECTOR of a DUP) as
// already canonical, in addition to whatever the base class accepts, so DAG
// combines do not try to re-materialise them.
26907bool AArch64TargetLowering::isTargetCanonicalConstantNode(SDValue Op) const {
26908  return Op.getOpcode() == AArch64ISD::DUP ||
26909         Op.getOpcode() == AArch64ISD::MOVI ||
26910         (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
26911          Op.getOperand(0).getOpcode() == AArch64ISD::DUP) ||
26913}
26914
  // Complex deinterleaving is available with SVE/SVE2 (CMLA/FCMLA etc.) or
  // with the NEON complex-number extension (FEAT_FCMA).
26916  return Subtarget->hasSVE() || Subtarget->hasSVE2() ||
26917         Subtarget->hasComplxNum();
26918}
26919
  // Complex deinterleaving is only defined over vector types.
26922  auto *VTy = dyn_cast<VectorType>(Ty);
26923  if (!VTy)
26924    return false;
26925
26926  // If the vector is scalable, SVE is enabled, implying support for complex
26927  // numbers. Otherwise, we need to ensure complex number support is available
26928  if (!VTy->isScalableTy() && !Subtarget->hasComplxNum())
26929    return false;
26930
26931  auto *ScalarTy = VTy->getScalarType();
26932  unsigned NumElements = VTy->getElementCount().getKnownMinValue();
26933
26934  // We can only process vectors that have a bit size of 128 or higher (with an
26935  // additional 64 bits for Neon). Additionally, these vectors must have a
26936  // power-of-2 size, as we later split them into the smallest supported size
26937  // and merging them back together after applying complex operation.
26938  unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
26939  if ((VTyWidth < 128 && (VTy->isScalableTy() || VTyWidth != 64)) ||
26940      !llvm::isPowerOf2_32(VTyWidth))
26941    return false;
26942
  // Scalable integer variants require SVE2 and 8..64-bit elements.
26943  if (ScalarTy->isIntegerTy() && Subtarget->hasSVE2() && VTy->isScalableTy()) {
26944    unsigned ScalarWidth = ScalarTy->getScalarSizeInBits();
26945    return 8 <= ScalarWidth && ScalarWidth <= 64;
26946  }
26947
  // Floating-point variants: f16 needs full FP16 support; f32/f64 are fine.
26948  return (ScalarTy->isHalfTy() && Subtarget->hasFullFP16()) ||
26949         ScalarTy->isFloatTy() || ScalarTy->isDoubleTy();
26950}
26951
26954    ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
26955    Value *Accumulator) const {
  // Emit AArch64 complex-arithmetic intrinsics (SVE [F]CMLA/[F]CADD or NEON
  // VCMLA/VCADD) for the requested operation/rotation. Vectors wider than
  // 128 bits are split in half, handled recursively, and re-concatenated.
26956  VectorType *Ty = cast<VectorType>(InputA->getType());
26957  bool IsScalable = Ty->isScalableTy();
26958  bool IsInt = Ty->getElementType()->isIntegerTy();
26959
26960  unsigned TyWidth =
26962
26963  assert(((TyWidth >= 128 && llvm::isPowerOf2_32(TyWidth)) || TyWidth == 64) &&
26964         "Vector type must be either 64 or a power of 2 that is at least 128");
26965
  // Recursive split: process lower/upper halves independently (including the
  // accumulator when present), then reassemble the full-width result.
26966  if (TyWidth > 128) {
26967    int Stride = Ty->getElementCount().getKnownMinValue() / 2;
26968    auto *HalfTy = VectorType::getHalfElementsVectorType(Ty);
26969    auto *LowerSplitA = B.CreateExtractVector(HalfTy, InputA, B.getInt64(0));
26970    auto *LowerSplitB = B.CreateExtractVector(HalfTy, InputB, B.getInt64(0));
26971    auto *UpperSplitA =
26972        B.CreateExtractVector(HalfTy, InputA, B.getInt64(Stride));
26973    auto *UpperSplitB =
26974        B.CreateExtractVector(HalfTy, InputB, B.getInt64(Stride));
26975    Value *LowerSplitAcc = nullptr;
26976    Value *UpperSplitAcc = nullptr;
26977    if (Accumulator) {
26978      LowerSplitAcc = B.CreateExtractVector(HalfTy, Accumulator, B.getInt64(0));
26979      UpperSplitAcc =
26980          B.CreateExtractVector(HalfTy, Accumulator, B.getInt64(Stride));
26981    }
26982    auto *LowerSplitInt = createComplexDeinterleavingIR(
26983        B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
26984    auto *UpperSplitInt = createComplexDeinterleavingIR(
26985        B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);
26986
26987    auto *Result = B.CreateInsertVector(Ty, PoisonValue::get(Ty), LowerSplitInt,
26988                                        B.getInt64(0));
26989    return B.CreateInsertVector(Ty, Result, UpperSplitInt, B.getInt64(Stride));
26990  }
26991
  // Partial complex multiply: SVE cmla_x (int) / fcmla (fp, predicated with an
  // all-true mask), or the rotation-specific NEON vcmla intrinsics.
26992  if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
26993    if (Accumulator == nullptr)
26995
26996    if (IsScalable) {
26997      if (IsInt)
26998        return B.CreateIntrinsic(
26999            Intrinsic::aarch64_sve_cmla_x, Ty,
27000            {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
27001
27002      auto *Mask = B.getAllOnesMask(Ty->getElementCount());
27003      return B.CreateIntrinsic(
27004          Intrinsic::aarch64_sve_fcmla, Ty,
27005          {Mask, Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
27006    }
27007
27008    Intrinsic::ID IdMap[4] = {Intrinsic::aarch64_neon_vcmla_rot0,
27009                              Intrinsic::aarch64_neon_vcmla_rot90,
27010                              Intrinsic::aarch64_neon_vcmla_rot180,
27011                              Intrinsic::aarch64_neon_vcmla_rot270};
27012
27013
27014    return B.CreateIntrinsic(IdMap[(int)Rotation], Ty,
27015                             {Accumulator, InputA, InputB});
27016  }
27017
  // Complex add: SVE cadd_x (int) / fcadd (fp); no NEON scalable fallback.
27018  if (OperationType == ComplexDeinterleavingOperation::CAdd) {
27019    if (IsScalable) {
27022      if (IsInt)
27023        return B.CreateIntrinsic(
27024            Intrinsic::aarch64_sve_cadd_x, Ty,
27025            {InputA, InputB, B.getInt32((int)Rotation * 90)});
27026
27027      auto *Mask = B.getAllOnesMask(Ty->getElementCount());
27028      return B.CreateIntrinsic(
27029          Intrinsic::aarch64_sve_fcadd, Ty,
27030          {Mask, InputA, InputB, B.getInt32((int)Rotation * 90)});
27031    }
27032    return nullptr;
27033  }
27034
  // NEON vcadd only exists for the 90 and 270 degree rotations.
27037      IntId = Intrinsic::aarch64_neon_vcadd_rot90;
27039      IntId = Intrinsic::aarch64_neon_vcadd_rot270;
27040
27041    if (IntId == Intrinsic::not_intrinsic)
27042      return nullptr;
27043
27044    return B.CreateIntrinsic(IntId, Ty, {InputA, InputB});
27045  }
27046
  // Unsupported operation kind.
27047  return nullptr;
27048}
27049
27050bool AArch64TargetLowering::preferScalarizeSplat(SDNode *N) const {
27051 unsigned Opc = N->getOpcode();
27052 if (ISD::isExtOpcode(Opc)) {
27053 if (any_of(N->uses(),
27054 [&](SDNode *Use) { return Use->getOpcode() == ISD::MUL; }))
27055 return false;
27056 }
27057 return true;
27058}
27059
// Minimum number of switch cases before a jump table is emitted; the
// threshold is delegated to the subtarget so it can vary per CPU/tuning.
27060unsigned AArch64TargetLowering::getMinimumJumpTableEntries() const {
27061  return Subtarget->getMinimumJumpTableEntries();
27062}
27063
27066                                                  EVT VT) const {
  // When SVE is used for fixed-length vectors, non-trivial fixed-length
  // vector arguments take their register type from the calling-convention
  // breakdown below; otherwise the default handling applies.
27067  bool NonUnitFixedLengthVector =
27069  if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
27071
  // Only RegisterVT is needed here; the other outputs are discarded.
27072  EVT VT1;
27073  MVT RegisterVT;
27074  unsigned NumIntermediates;
27075  getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1, NumIntermediates,
27076                                       RegisterVT);
27077  return RegisterVT;
27078}
27079
27081    LLVMContext &Context, CallingConv::ID CC, EVT VT) const {
  // Mirrors getRegisterTypeForCallingConv: for non-trivial fixed-length
  // vectors under SVE fixed-length lowering, the register count comes from
  // the calling-convention breakdown; otherwise the default applies.
27082  bool NonUnitFixedLengthVector =
27084  if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
27086
  // Only the intermediate count is of interest; other outputs are discarded.
27087  EVT VT1;
27088  MVT VT2;
27089  unsigned NumIntermediates;
27091                                              NumIntermediates, VT2);
27092}
27093
27095    LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
27096    unsigned &NumIntermediates, MVT &RegisterVT) const {
  // Start from the generic breakdown; only adjust when it produced a
  // fixed-length vector register wider than 128 bits (i.e. an SVE VLS type).
27098      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
27099  if (!RegisterVT.isFixedLengthVector() ||
27100      RegisterVT.getFixedSizeInBits() <= 128)
27101    return NumRegs;
27102
27103  assert(Subtarget->useSVEForFixedLengthVectors() && "Unexpected mode!");
27104  assert(IntermediateVT == RegisterVT && "Unexpected VT mismatch!");
27105  assert(RegisterVT.getFixedSizeInBits() % 128 == 0 && "Unexpected size!");
27106
27107  // A size mismatch here implies either type promotion or widening and would
27108  // have resulted in scalarisation if larger vectors had not be available.
27109  if (RegisterVT.getSizeInBits() * NumRegs != VT.getSizeInBits()) {
27110    EVT EltTy = VT.getVectorElementType();
    // Fall back to the element type itself when the single-element vector
    // type is not legal.
27112    if (!isTypeLegal(NewVT))
27113      NewVT = EltTy;
27114
27115    IntermediateVT = NewVT;
27116    NumIntermediates = VT.getVectorNumElements();
27117    RegisterVT = getRegisterType(Context, NewVT);
27118    return NumIntermediates;
27119  }
27120
27121  // SVE VLS support does not introduce a new ABI so we should use NEON sized
27122  // types for vector arguments and returns.
27123
  // Split the wide SVE register into 128-bit NEON-sized sub-registers and
  // scale both counts accordingly.
27124  unsigned NumSubRegs = RegisterVT.getFixedSizeInBits() / 128;
27125  NumIntermediates *= NumSubRegs;
27126  NumRegs *= NumSubRegs;
27127
  // Map each element type to its 128-bit NEON register type.
27128  switch (RegisterVT.getVectorElementType().SimpleTy) {
27129  default:
27130    llvm_unreachable("unexpected element type for vector");
27131  case MVT::i8:
27132    IntermediateVT = RegisterVT = MVT::v16i8;
27133    break;
27134  case MVT::i16:
27135    IntermediateVT = RegisterVT = MVT::v8i16;
27136    break;
27137  case MVT::i32:
27138    IntermediateVT = RegisterVT = MVT::v4i32;
27139    break;
27140  case MVT::i64:
27141    IntermediateVT = RegisterVT = MVT::v2i64;
27142    break;
27143  case MVT::f16:
27144    IntermediateVT = RegisterVT = MVT::v8f16;
27145    break;
27146  case MVT::f32:
27147    IntermediateVT = RegisterVT = MVT::v4f32;
27148    break;
27149  case MVT::f64:
27150    IntermediateVT = RegisterVT = MVT::v2f64;
27151    break;
27152  case MVT::bf16:
27153    IntermediateVT = RegisterVT = MVT::v8bf16;
27154    break;
27155  }
27156
27157  return NumRegs;
27158}
27159
27161                                              const MachineFunction &MF) const {
  // Inline stack probing is used whenever the function requests stack
  // probing, except on Windows, which has its own chkstk-style mechanism.
27162  return !Subtarget->isTargetWindows() &&
27163         MF.getInfo<AArch64FunctionInfo>()->hasStackProbing();
27164}
unsigned const MachineRegisterInfo * MRI
static unsigned MatchRegisterName(StringRef Name)
static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc, uint64_t &Imm)
static bool isIntImmediate(const SDNode *N, uint64_t &Imm)
isIntImmediate - This method tests to see if the node is a constant operand.
static void CustomNonLegalBITCASTResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, EVT ExtendVT, EVT CastVT)
static bool isConcatMask(ArrayRef< int > Mask, EVT VT, bool SplitLHS)
static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG)
static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS, AArch64CC::CondCode CC, bool NoNans, EVT VT, const SDLoc &dl, SelectionDAG &DAG)
static bool isAddSubSExt(SDValue N, SelectionDAG &DAG)
static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue CCOp, AArch64CC::CondCode Predicate, AArch64CC::CondCode OutCC, const SDLoc &DL, SelectionDAG &DAG)
can be transformed to: not (and (not (and (setCC (cmp C)) (setCD (cmp D)))) (and (not (setCA (cmp A))...
static void changeVectorFPCCToAArch64CC(ISD::CondCode CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2, bool &Invert)
changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC usable with the vector...
static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt)
isVShiftRImm - Check if this is a valid build_vector for the immediate operand of a vector shift righ...
static bool isSingletonEXTMask(ArrayRef< int > M, EVT VT, unsigned &Imm)
static SDValue foldCSELofCTTZ(SDNode *N, SelectionDAG &DAG)
static SDValue performCONDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, unsigned CCIndex, unsigned CmpIndex)
static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue NormalizeBuildVector(SDValue Op, SelectionDAG &DAG)
static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St)
Replace a splat of zeros to a vector store by scalar stores of WZR/XZR.
static SDValue tryToWidenSetCCOperands(SDNode *Op, SelectionDAG &DAG)
static SDValue performLastTrueTestVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue GenerateTBL(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue performDUPCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static std::optional< PredicateConstraint > parsePredicateConstraint(StringRef Constraint)
static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static void analyzeCallOperands(const AArch64TargetLowering &TLI, const AArch64Subtarget *Subtarget, const TargetLowering::CallLoweringInfo &CLI, CCState &CCInfo)
static std::optional< unsigned > IsSVECntIntrinsic(SDValue S)
static SDValue performVectorAddSubExtCombine(SDNode *N, SelectionDAG &DAG)
static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo)
Check whether or not Op is a SET_CC operation, either a generic or an AArch64 lowered one.
static bool isLegalArithImmed(uint64_t C)
static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT)
static ScalableVectorType * getSVEContainerIRType(FixedVectorType *VTy)
static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG)
unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend)
static SDValue performMulVectorCmpZeroCombine(SDNode *N, SelectionDAG &DAG)
static SDValue convertFixedMaskToScalableVector(SDValue Mask, SelectionDAG &DAG)
static bool shouldSinkVScale(Value *Op, SmallVectorImpl< Use * > &Ops)
We want to sink following cases: (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A,...
static bool isZeroingInactiveLanes(SDValue Op)
static SDValue trySwapVSelectOperands(SDNode *N, SelectionDAG &DAG)
static bool isREVMask(ArrayRef< int > M, EVT VT, unsigned BlockSize)
isREVMask - Check if a vector shuffle corresponds to a REV instruction with the specified blocksize.
static SDValue tryCombineMULLWithUZP1(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static bool isExtendedBUILD_VECTOR(SDValue N, SelectionDAG &DAG, bool isSigned)
static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG)
static bool isZerosVector(const SDNode *N)
isZerosVector - Check whether SDNode N is a zero-filled vector.
static EVT tryGetOriginalBoolVectorType(SDValue Op, int Depth=0)
static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG)
static SDValue performGLD1Combine(SDNode *N, SelectionDAG &DAG)
static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
Fold a floating-point divide by power of two into fixed-point to floating-point conversion.
static const TargetRegisterClass * getReducedGprRegisterClass(ReducedGprConstraint Constraint, EVT VT)
static SDValue carryFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG, bool Invert)
static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset, SDLoc DL, unsigned BitWidth)
static bool isPredicateCCSettingOp(SDValue N)
static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG)
static bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType)
static SDValue performSVEAndCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue overflowFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG)
static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2, ArrayRef< int > ShuffleMask, EVT VT, EVT ContainerVT, SelectionDAG &DAG)
static SDValue performBRCONDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static MVT getSVEContainerType(EVT ContentTy)
static SDValue getNegatedInteger(SDValue Op, SelectionDAG &DAG)
static bool isMergePassthruOpcode(unsigned Opc)
static unsigned selectUmullSmull(SDValue &N0, SDValue &N1, SelectionDAG &DAG, SDLoc DL, bool &IsMLA)
static SDValue performFADDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue performNEONPostLDSTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
Target-specific DAG combine function for NEON load/store intrinsics to merge base address updates.
static void ReplaceCMP_SWAP_128Results(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N)
static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp, SelectionDAG &DAG)
static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget, const AArch64TargetLowering &TLI)
static bool isZeroExtended(SDValue N, SelectionDAG &DAG)
static bool areExtractExts(Value *Ext1, Value *Ext2)
Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth of the vector elements.
static SDValue performSelectCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with the compare-mask instruct...
static bool isCheapToExtend(const SDValue &N)
static cl::opt< bool > EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden, cl::desc("Enable AArch64 logical imm instruction " "optimization"), cl::init(true))
static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl< Use * > &Ops)
static bool isUZPMask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
static SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG)
static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes, unsigned ScalarSizeInBytes)
Check if the value of OffsetInBytes can be used as an immediate for the gather load/prefetch and scat...
static bool isUZP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of "vector_shuffle v,...
static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits, const SDValue *LHS=nullptr)
#define LCALLNAME4(A, B)
static unsigned getDUPLANEOp(EVT EltType)
static void changeFPCCToAArch64CC(ISD::CondCode CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2)
changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
static bool isTRNMask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget, const TargetMachine &TM)
static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST, EVT VT, EVT MemVT, SelectionDAG &DAG)
static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
static bool isExtendOrShiftOperand(SDValue N)
static bool isLanes1toNKnownZero(SDValue Op)
static bool setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL, AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI)
Set the IntrinsicInfo for the aarch64_sve_st<N> intrinsics.
static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG)
static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N, SelectionDAG &DAG)
static SDValue performNVCASTCombine(SDNode *N)
Get rid of unnecessary NVCASTs (that don't change the type).
static EVT getPackedSVEVectorVT(EVT VT)
static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG)
static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performUnpackCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue performVecReduceBitwiseCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performFlagSettingCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, unsigned GenericOpcode)
static SDValue performSpliceCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performCSELCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static void ReplaceReductionResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, unsigned InterOp, unsigned AcrossOp)
static bool isEquivalentMaskless(unsigned CC, unsigned width, ISD::LoadExtType ExtType, int AddConstant, int CompConstant)
static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG)
static EVT getExtensionTo64Bits(const EVT &OrigVT)
static bool isCMP(SDValue Op)
static SDValue performTruncateCombine(SDNode *N, SelectionDAG &DAG)
static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG)
static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
static Function * getStructuredLoadFunction(Module *M, unsigned Factor, bool Scalable, Type *LDVTy, Type *PtrTy)
static SDValue foldCSELOfCSEL(SDNode *Op, SelectionDAG &DAG)
static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc, SelectionDAG &DAG, bool UnpredOp=false, bool SwapOperands=false)
static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad)
static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val, AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp, AArch64CC::CondCode Predicate)
Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain of CCMP/CFCMP ops.
static SDValue performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated)
static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian)
static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT, unsigned Opcode, SelectionDAG &DAG)
static bool createTblShuffleForZExt(ZExtInst *ZExt, FixedVectorType *DstTy, bool IsLittleEndian)
static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N, SelectionDAG &DAG)
static AArch64CC::CondCode parseConstraintCode(llvm::StringRef Constraint)
static bool isINSMask(ArrayRef< int > M, int NumInputElements, bool &DstIsLeft, int &Anomaly)
static const MCPhysReg GPRArgRegs[]
static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits, APInt &UndefBits)
static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG)
static SDValue performSignExtendSetCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static bool isPassedInFPR(EVT VT)
static unsigned getIntrinsicID(const SDNode *N)
static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert)
static SDValue performAddUADDVCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performExtBinopLoadFold(SDNode *N, SelectionDAG &DAG)
static bool findMoreOptimalIndexType(const MaskedGatherScatterSDNode *N, SDValue &BasePtr, SDValue &Index, SelectionDAG &DAG)
static SDValue performANDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool canEmitConjunction(const SDValue Val, bool &CanNegate, bool &MustBeFirst, bool WillNegate, unsigned Depth=0)
Returns true if Val is a tree of AND/OR/SETCC operations that can be expressed as a conjunction.
static bool isWideDUPMask(ArrayRef< int > M, EVT VT, unsigned BlockSize, unsigned &DupLaneOp)
Check if a vector shuffle corresponds to a DUP instruction with a larger element width than the vector lane type.
static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL, EVT VT)
static cl::opt< bool > EnableExtToTBL("aarch64-enable-ext-to-tbl", cl::Hidden, cl::desc("Combine ext and trunc to TBL"), cl::init(true))
static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St, SDValue SplatVal, unsigned NumVecElts)
static SDValue performNegCSelCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG)
static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *ST)
static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1, SDValue V2, unsigned PFEntry, SDValue LHS, SDValue RHS, SelectionDAG &DAG, const SDLoc &dl)
GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit the specified operations t...
static SDValue performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue removeRedundantInsertVectorElt(SDNode *N)
static std::optional< AArch64CC::CondCode > getCSETCondCode(SDValue Op)
static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc, SelectionDAG &DAG)
static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG)
Legalize the gather prefetch (scalar + vector addressing mode) when the offset vector is an unpacked ...
static bool isNegatedInteger(SDValue Op)
static SDValue performFirstTrueTestVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static bool isLoadOrMultipleLoads(SDValue B, SmallVector< LoadSDNode * > &Loads)
static SDValue performSubAddMULCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc)
static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16)
static Function * getStructuredStoreFunction(Module *M, unsigned Factor, bool Scalable, Type *STVTy, Type *PtrTy)
static SDValue performVectorShiftCombine(SDNode *N, const AArch64TargetLowering &TLI, TargetLowering::DAGCombinerInfo &DCI)
Optimize a vector shift instruction and its operand if shifted out bits are not used.
static SDValue performUADDVAddCombine(SDValue A, SelectionDAG &DAG)
static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG, unsigned ScalarSizeInBytes)
Combines a node carrying the intrinsic aarch64_sve_prf<T>_gather_scalar_offset into a node that uses ...
static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St)
Replace a splat of a scalar to a vector store by scalar stores of the scalar value.
unsigned getSignExtendedGatherOpcode(unsigned Opcode)
static bool isOrXorChain(SDValue N, unsigned &Num, SmallVector< std::pair< SDValue, SDValue >, 16 > &WorkList)
static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt)
getVShiftImm - Check if this is a valid build_vector for the immediate operand of a vector shift operation, where all the elements of the build_vector must have the same constant integer value.
static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC)
changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64 CC
static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG, unsigned Opcode, bool OnlyPackedOffsets=true)
static SDValue foldOverflowCheck(SDNode *Op, SelectionDAG &DAG, bool IsAdd)
static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc, SelectionDAG &DAG)
static SDValue performDupLane128Combine(SDNode *N, SelectionDAG &DAG)
static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm, const APInt &Demanded, TargetLowering::TargetLoweringOpt &TLO, unsigned NewOpc)
static unsigned getCmpOperandFoldingProfit(SDValue Op)
Returns how profitable it is to fold a comparison's operand's shift and/or extension operations.
static SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue performConcatVectorsCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performSVEMulAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N, SelectionDAG &DAG)
#define MAKE_CASE(V)
static SDValue LowerFunnelShift(SDValue Op, SelectionDAG &DAG)
static SDValue performBuildShuffleExtendCombine(SDValue BV, SelectionDAG &DAG)
Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern into sext/zext(buildvector), making use of the vector SExt/ZExt rather than the scalar SExt/ZExt.
static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG, const EVT &OrigTy, const EVT &ExtTy, unsigned ExtOpcode)
static SDValue performAddSubIntoVectorOp(SDNode *N, SelectionDAG &DAG)
static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL, EVT VT)
static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG)
static const MCPhysReg FPRArgRegs[]
static SDValue getSETCC(AArch64CC::CondCode CC, SDValue NZCV, const SDLoc &DL, SelectionDAG &DAG)
Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR, WZR, invert(<cond>)'.
static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG)
static void replaceBoolVectorBitcast(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG)
static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG)
static SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT, int Pattern)
static bool isEXTMask(ArrayRef< int > M, EVT VT, bool &ReverseEXT, unsigned &Imm)
static std::optional< ReducedGprConstraint > parseReducedGprConstraint(StringRef Constraint)
static SDValue tryCombineFixedPointConvert(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT)
static SDValue performSETCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG)
Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup)) making use of the vector SExt/ZExt rather than the scalar SExt/ZExt.
static SDValue performAddSubLongCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG)
static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
Fold a floating-point multiply by power of two into floating-point to fixed-point conversion.
static EVT calculatePreExtendType(SDValue Extend)
Calculates what the pre-extend type is, based on the extension operation node provided by Extend.
static SDValue performSetCCPunpkCombine(SDNode *N, SelectionDAG &DAG)
static EVT getPromotedVTForPredicate(EVT VT)
static void changeFPCCToANDAArch64CC(ISD::CondCode CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2)
Convert a DAG fp condition code to an AArch64 CC.
static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
Turn vector tests of the signbit in the form of: xor (sra X, elt_size(X)-1), -1 into: cmge X,...
static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG)
static bool isAllConstantBuildVector(const SDValue &PotentialBVec, uint64_t &ConstVal)
static SDValue performExtractSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG)
static Value * UseTlsOffset(IRBuilderBase &IRB, unsigned Offset)
static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG)
WidenVector - Given a value in the V64 register class, produce the equivalent value in the V128 register class.
static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG)
static bool isSignExtended(SDValue N, SelectionDAG &DAG)
static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op, AArch64CC::CondCode Cond)
static bool isSetCCOrZExtSetCC(const SDValue &Op, SetCCInfoAndKind &Info)
cl::opt< bool > EnableAArch64ELFLocalDynamicTLSGeneration("aarch64-elf-ldtls-generation", cl::Hidden, cl::desc("Allow AArch64 Local Dynamic TLS code generation"), cl::init(false))
static SDValue ReconstructTruncateFromBuildVector(SDValue V, SelectionDAG &DAG)
static SDValue performBSPExpandForSVE(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue foldADCToCINC(SDNode *N, SelectionDAG &DAG)
static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG)
static SDValue performSunpkloCombine(SDNode *N, SelectionDAG &DAG)
static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static unsigned getAtomicLoad128Opcode(unsigned ISDOpcode, AtomicOrdering Ordering)
static void ReplaceAddWithADDP(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performSetccMergeZeroCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue performPostLD1Combine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, bool IsLaneOp)
Target-specific DAG combine function for post-increment LD1 (lane) and post-increment LD1R.
static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2)
Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
std::pair< SDValue, uint64_t > lookThroughSignExtension(SDValue Val)
bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL)
static SDValue performMSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG)
static bool foldIndexIntoBase(SDValue &BasePtr, SDValue &Index, SDValue Scale, SDLoc DL, SelectionDAG &DAG)
static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue skipExtensionForVectorMULL(SDValue N, SelectionDAG &DAG)
static SDValue performOrXorChainCombine(SDNode *N, SelectionDAG &DAG)
static bool isSplatShuffle(Value *V)
bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT)
static SDValue performSVESpliceCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performAddCombineForShiftedOperands(SDNode *N, SelectionDAG &DAG)
static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V)
static SDValue lowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG, unsigned Opcode, bool IsSigned)
static bool isPackedVectorType(EVT VT, SelectionDAG &DAG)
Returns true if VT's elements occupy the lowest bit positions of its associated register class without any intervening space.
static bool isTRN_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of "vector_shuffle v, v, <0, 0, 2, 2>", i.e. both results are the same input vector.
static bool isZIPMask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
static bool isAddSubZExt(SDValue N, SelectionDAG &DAG)
static SDValue performSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt)
isVShiftLImm - Check if this is a valid build_vector for the immediate operand of a vector shift left operation. The shift amount must be in the range [0, element bits).
static SDValue performExtendCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performMaskedGatherScatterCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert, SelectionDAG &DAG)
static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl, SelectionDAG &DAG, SDValue Chain, bool IsSignaling)
static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performBuildVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V)
static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG, unsigned Opcode, bool OnlyPackedOffsets=true)
static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64TargetLowering &TLI)
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls)
Return true if the calling convention is one that we can guarantee TCO for.
static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue LowerFLDEXP(SDValue Op, SelectionDAG &DAG)
static SDValue combineSVEReductionInt(SDNode *N, unsigned Opc, SelectionDAG &DAG)
static bool isCMN(SDValue Op, ISD::CondCode CC)
static bool isOperandOfVmullHighP64(Value *Op)
Check if Op could be used with vmull_high_p64 intrinsic.
static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode, SDValue Operand, SelectionDAG &DAG, int &ExtraSteps)
static SDValue performUADDVZextCombine(SDValue A, SelectionDAG &DAG)
static SDValue performAddCSelIntoCSinc(SDNode *N, SelectionDAG &DAG)
Perform the scalar expression combine in the form of: CSEL(c, 1, cc) + b => CSINC(b+c,...
static SDValue performCTLZCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static std::optional< uint64_t > getConstantLaneNumOfExtractHalfOperand(SDValue &Op)
static void ReplaceATOMIC_LOAD_128Results(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static bool areLoadedOffsetButOtherwiseSame(SDValue Op0, SDValue Op1, SelectionDAG &DAG, unsigned &NumSubLoads)
static SDValue performLOADCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG, StoreSDNode *Store)
static bool isEssentiallyExtractHighSubvector(SDValue N)
static bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
static unsigned getExtFactor(SDValue &V)
getExtFactor - Determine the adjustment factor for the position when generating an "extract from vect...
static cl::opt< unsigned > MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden, cl::desc("Maximum of xors"))
#define LCALLNAME5(A, B)
static SDValue performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits, const SDValue *LHS=nullptr)
static SDValue performMULLCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG)
static SDValue trySimplifySrlAddToRshrnb(SDValue Srl, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static const MVT MVT_CC
Value type used for condition codes.
static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue performReinterpretCastCombine(SDNode *N)
SDValue ReconstructShuffleWithRuntimeMask(SDValue Op, SelectionDAG &DAG)
static SDValue optimizeWhile(SDValue Op, SelectionDAG &DAG, bool IsSigned, bool IsLess, bool IsEqual)
static SDValue performTBZCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val, AArch64CC::CondCode &OutCC)
Emit expression as a conjunction (a series of CCMP/CFCMP ops).
static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N)
static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &AArch64cc, SelectionDAG &DAG, const SDLoc &dl)
static bool performTBISimplification(SDValue Addr, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
Simplify Addr given that the top byte of it is ignored by HW during address translation.
static bool areExtractShuffleVectors(Value *Op1, Value *Op2, bool AllowSplat=false)
Check if both Op1 and Op2 are shufflevector extracts of either the lower or upper half of the vector elements.
static bool isAllInactivePredicate(SDValue N)
static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT, SDLoc DL, SelectionDAG &DAG)
static SDValue performIntrinsicCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static cl::opt< bool > EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden, cl::desc("Combine extends of AArch64 masked " "gather intrinsics"), cl::init(true))
static bool isZIP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of "vector_shuffle v,...
static SDValue performInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static bool isWideTypeMask(ArrayRef< int > M, EVT VT, SmallVectorImpl< int > &NewMask)
static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V)
static SDValue performAddCombineSubShift(SDNode *N, SDValue SUB, SDValue Z, SelectionDAG &DAG)
static SDValue performANDSETCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static const TargetRegisterClass * getPredicateRegisterClass(PredicateConstraint Constraint, EVT VT)
static SDValue performAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue performSubsToAndsCombine(SDNode *N, SDNode *SubsNode, SDNode *AndNode, SelectionDAG &DAG, unsigned CCIndex, unsigned CmpIndex, unsigned CC)
static std::pair< SDValue, SDValue > getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG)
#define FALKOR_STRIDED_ACCESS_MD
@ Generic
SmallVector< AArch64_IMM::ImmInsnModel, 4 > Insn
static const unsigned PerfectShuffleTable[6561+1]
static unsigned getPerfectShuffleCost(llvm::ArrayRef< int > M)
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static bool isConstant(const MachineInstr &MI)
amdgpu AMDGPU Register Bank Select
This file declares a class to represent arbitrary precision floating point values and provide a variety of arithmetic operations on them.
This file implements a class to represent arbitrary precision integral constant values and operations...
@ Scaled
@ OP_VEXT3
@ OP_VTRNR
@ OP_VDUP1
@ OP_VZIPR
@ OP_VUZPR
@ OP_VREV
@ OP_VZIPL
@ OP_VTRNL
@ OP_COPY
@ OP_VEXT1
@ OP_VDUP0
@ OP_VEXT2
@ OP_VUZPL
@ OP_VDUP3
@ OP_VDUP2
Function Alias Analysis Results
Atomic ordering constants.
This file contains the simple types necessary to represent the attributes associated with functions and their calls.
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static bool isConstantSplatVectorMaskForType(SDNode *N, EVT ScalarTy)
return RetTy
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
uint64_t Addr
uint64_t Size
bool End
Definition: ELF_riscv.cpp:478
Symbol * Sym
Definition: ELF_riscv.cpp:477
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static Function * getFunction(Constant *C)
Definition: Evaluator.cpp:236
static bool isSigned(unsigned int Opcode)
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
#define im(i)
Hexagon Common GEP
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
std::pair< Value *, Value * > ShuffleOps
We are building a shuffle to create V, which is a sequence of insertelement, extractelement pairs.
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
#define RegName(no)
static LVOptions Options
Definition: LVOptions.cpp:25
lazy value info
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
#define G(x, y, z)
Definition: MD5.cpp:56
mir Rename Register Operands
unsigned const TargetRegisterInfo * TRI
This file provides utility analysis objects describing memory locations.
Module.h This file contains the declarations for the Module class.
LLVMContext & Context
This file defines ARC utility functions which are used by various parts of the compiler.
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
PowerPC Reduce CR logical Operation
const char LLVMTargetMachineRef TM
static bool getVal(MDTuple *MD, const char *Key, uint64_t &Val)
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file contains some templates that are useful if you are working with the STL at all.
This file defines the SmallSet class.
This file defines the SmallVector class.
static bool Enabled
Definition: Statistic.cpp:46
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
static const int BlockSize
Definition: TarWriter.cpp:33
This pass exposes codegen information to IR-level passes.
This defines the Use class.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition: Value.cpp:469
static constexpr int Concat[]
Value * RHS
Value * LHS
AArch64FunctionInfo - This class is derived from MachineFunctionInfo and contains private AArch64-spe...
void setVarArgsStackOffset(unsigned Offset)
void setTailCallReservedStack(unsigned bytes)
SmallVectorImpl< ForwardedRegister > & getForwardedMustTailRegParms()
void setBytesInStackArgArea(unsigned bytes)
void setHasSwiftAsyncContext(bool HasContext)
void setJumpTableEntryInfo(int Idx, unsigned Size, MCSymbol *PCRelSym)
void setArgumentStackToRestore(unsigned bytes)
void setHasStreamingModeChanges(bool HasChanges)
bool isLegalAddressingMode(unsigned NumBytes, int64_t Offset, unsigned Scale) const
void UpdateCustomCalleeSavedRegs(MachineFunction &MF) const
bool isNeonAvailable() const
Returns true if the target has NEON and the function at runtime is known to have NEON enabled (e....
const AArch64RegisterInfo * getRegisterInfo() const override
unsigned getMinimumJumpTableEntries() const
const AArch64InstrInfo * getInstrInfo() const override
const char * getSecurityCheckCookieName() const
unsigned getMaximumJumpTableSize() const
ARMProcFamilyEnum getProcFamily() const
Returns ARM processor family.
unsigned classifyGlobalFunctionReference(const GlobalValue *GV, const TargetMachine &TM) const
Align getPrefLoopAlignment() const
Align getPrefFunctionAlignment() const
unsigned getMaxBytesForLoopAlignment() const
bool supportsAddressTopByteIgnored() const
CPU has TBI (top byte of addresses is ignored during HW address translation) and OS enables it.
const Triple & getTargetTriple() const
bool isCallingConvWin64(CallingConv::ID CC) const
const char * getChkStkName() const
bool useSVEForFixedLengthVectors() const
unsigned ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const
ClassifyGlobalReference - Find the target operand flags that describe how a global value should be re...
bool isSVEAvailable() const
Returns true if the target has SVE and can use the full range of SVE instructions,...
bool isXRegisterReserved(size_t i) const
unsigned getMaxSVEVectorSizeInBits() const
unsigned getMinSVEVectorSizeInBits() const
bool hasCustomCallingConv() const
bool isTruncateFree(Type *Ty1, Type *Ty2) const override
Return true if it's free to truncate a value of type FromTy to type ToTy.
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT) const override
Return true if pulling a binary operation into a select with an identity constant is profitable.
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations and not for other operations.
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
SDValue changeStreamingMode(SelectionDAG &DAG, SDLoc DL, bool Enable, SDValue Chain, SDValue InGlue, SDValue PStateSM, bool Entry) const
If a change in streaming mode is required on entry to/return from a function call it emits and return...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
bool isShuffleMaskLegal(ArrayRef< int > M, EVT VT) const override
Return true if the given shuffle mask can be codegen'd directly, or if it should be stack expanded.
unsigned getVaListSizeInBits(const DataLayout &DL) const override
Returns the size of the platform's va_list object.
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset, int64_t MaxOffset) const override
Return the prefered common base offset.
bool shouldInsertTrailingFenceForAtomicStore(const Instruction *I) const override
Whether AtomicExpandPass should automatically insert a trailing fence without reducing the ordering f...
bool shouldExpandCttzElements(EVT VT) const override
Return true if the @llvm.experimental.cttz.elts intrinsic should be expanded using generic code in Se...
MachineBasicBlock * EmitTileLoad(unsigned Opc, unsigned BaseReg, MachineInstr &MI, MachineBasicBlock *BB) const
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL, bool UseScalable) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given...
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Returns true if it is beneficial to convert a load of a constant to just the constant itself.
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
Provide custom lowering hooks for some operations.
bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override
Should we generate fp_to_si_sat and fp_to_ui_sat from type FPVT to type VT from min(max(fptoi)) satur...
bool isIntDivCheap(EVT VT, AttributeList Attr) const override
Return true if integer divide is usually cheaper than a sequence of several shifts,...
bool shouldRemoveRedundantExtend(SDValue Op) const override
Return true (the default) if it is profitable to remove a sext_inreg(x) where the sext is redundant,...
CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC) const
Selects the correct CCAssignFn for a given CallingConvention value.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ISD::SETCC ValueType.
bool optimizeExtendOrTruncateConversion(Instruction *I, Loop *L, const TargetTransformInfo &TTI) const override
Try to optimize extending or truncating conversion instructions (like zext, trunc,...
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override
This method returns a target specific FastISel object, or null if the target does not support "fast" ...
CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const
Selects the correct CCAssignFn for a given CallingConvention value.
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool hasInlineStackProbe(const MachineFunction &MF) const override
True if stack clash protection is enabled for this functions.
bool isLegalICmpImmediate(int64_t) const override
Return true if the specified immediate is legal icmp immediate, that is the target has icmp instructi...
EVT getOptimalMemOpType(const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
Value * emitStoreConditional(IRBuilderBase &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override
Perform a store-conditional operation to Addr.
bool preferIncOfAddToSubOfNot(EVT VT) const override
These two forms are equivalent: sub y, (xor x, -1) add (add x, 1), y The variant with two add's is IR...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
MachineBasicBlock * EmitZAInstr(unsigned Opc, unsigned BaseReg, MachineInstr &MI, MachineBasicBlock *BB, bool HasTile) const
ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const override
bool isOpSuitableForLSE128(const Instruction *I) const
bool lowerInterleavedLoad(LoadInst *LI, ArrayRef< ShuffleVectorInst * > Shuffles, ArrayRef< unsigned > Indices, unsigned Factor) const override
Lower an interleaved load into a ldN intrinsic.
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool shouldSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Check if sinking I's operands to I's basic block is profitable, because the operands can be folded in...
bool fallBackToDAGISel(const Instruction &Inst) const override
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
getTgtMemIntrinsic - Represent NEON load and store intrinsics as MemIntrinsicNodes.
Function * getSSPStackGuardCheck(const Module &M) const override
If the target has a standard stack protection check function that performs validation and error handl...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
Value * createComplexDeinterleavingIR(IRBuilderBase &B, ComplexDeinterleavingOperation OperationType, ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, Value *Accumulator=nullptr) const override
Create the IR node for the given complex deinterleaving operation.
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const override
Returns true if the target allows unaligned memory accesses of the specified type.
unsigned getMaxSupportedInterleaveFactor() const override
Get the maximum supported factor for interleaved memory accesses.
bool isLegalInterleavedAccessType(VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const
Returns true if VecTy is a legal interleaved access type.
void insertSSPDeclarations(Module &M) const override
Inserts necessary declarations for SSP (stack protection) purpose.
bool functionArgumentNeedsConsecutiveRegisters(Type *Ty, CallingConv::ID CallConv, bool isVarArg, const DataLayout &DL) const override
For some targets, an LLVM struct type must be broken down into multiple simple types,...
Value * emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, Value *Addr, AtomicOrdering Ord) const override
Perform a load-linked operation on Addr, returning a "Value *" with the corresponding pointee type.
MachineBasicBlock * EmitLoweredCatchRet(MachineInstr &MI, MachineBasicBlock *BB) const
bool isComplexDeinterleavingSupported() const override
Does this target support complex deinterleaving.
bool isZExtFree(Type *Ty1, Type *Ty2) const override
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
EVT getAsmOperandValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const override
SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const
MachineBasicBlock * EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool useLoadStackGuardNode() const override
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
Value * getSafeStackPointerLocation(IRBuilderBase &IRB) const override
If the target has a standard location for the unsafe stack pointer, returns the address of that locat...
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override
Return if the target supports combining a chain like:
bool isProfitableToHoist(Instruction *I) const override
Check if it is profitable to hoist instruction in then/else to if.
bool isOpSuitableForRCPC3(const Instruction *I) const
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const override
Return true if it is profitable to reduce a load to a smaller type.
MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const override
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const override
Lower an interleaved store into a stN intrinsic.
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
MachineBasicBlock * EmitZTInstr(MachineInstr &MI, MachineBasicBlock *BB, unsigned Opcode, bool Op0IsDef) const
MachineBasicBlock * EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const
bool shouldInsertFencesForAtomic(const Instruction *I) const override
Whether AtomicExpandPass should automatically insert fences and reduce ordering for this atomic.
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
Control the following reassociation of operands: (op (op x, c1), y) -> (op (op x, y),...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
MachineBasicBlock * EmitF128CSEL(MachineInstr &MI, MachineBasicBlock *BB) const
LLT getOptimalMemOpLLT(const MemOp &Op, const AttributeList &FuncAttributes) const override
LLT returning variant.
bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const override
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool needsFixedCatchObjects() const override
Used for exception handling on Win64.
bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI, LoadInst *LI) const override
Lower a deinterleave intrinsic to a target specific load intrinsic.
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Value * getIRStackGuard(IRBuilderBase &IRB) const override
If the target has a standard location for the stack protector cookie, returns the address of that loc...
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
bool generateFMAsInMachineCombiner(EVT VT, CodeGenOptLevel OptLevel) const override
bool isComplexDeinterleavingOperationSupported(ComplexDeinterleavingOperation Operation, Type *Ty) const override
Does this target support complex deinterleaving with the given operation and type.
bool hasPairedLoad(EVT LoadedType, Align &RequiredAligment) const override
Return true if the target supplies and combines to a paired load two loaded values of type LoadedType...
bool isOpSuitableForLDPSTP(const Instruction *I) const
bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to fold a pair of shifts into a mask.
AArch64TargetLowering(const TargetMachine &TM, const AArch64Subtarget &STI)
bool isLegalAddImmediate(int64_t) const override
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
bool shouldConsiderGEPOffsetSplit() const override
bool isVectorClearMaskLegal(ArrayRef< int > M, EVT VT) const override
Similar to isShuffleMaskLegal.
const MCPhysReg * getScratchRegisters(CallingConv::ID CC) const override
Returns a 0 terminated array of registers that can be safely used as scratch registers.
void emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
MachineInstr * EmitKCFICheck(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &MBBI, const TargetInstrInfo *TII) const override
bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) const
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
bool isDesirableToCommuteXorWithShift(const SDNode *N) const override
Returns false if N is a bit extraction pattern of (X >> C) & Mask.
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Returns false if N is a bit extraction pattern of (X >> C) & Mask.
bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II, StoreInst *SI) const override
Lower an interleave intrinsic to a target specific store intrinsic.
bool enableAggressiveFMAFusion(EVT VT) const override
Enable aggressive FMA fusion on targets that want it.
Value * getSDagStackGuard(const Module &M) const override
Return the variable that's previously inserted by insertSSPDeclarations, if any, otherwise return nul...
MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
MachineBasicBlock * EmitDynamicProbedAlloc(MachineInstr &MI, MachineBasicBlock *MBB) const
bool shouldExpandGetActiveLaneMask(EVT VT, EVT OpVT) const override
Return true if the @llvm.get.active.lane.mask intrinsic should be expanded using generic code in Sele...
bool isMulAddWithConstProfitable(SDValue AddNode, SDValue ConstNode) const override
Return true if it may be profitable to transform (mul (add x, c1), c2) -> (add (mul x,...
bool useSVEForFixedLengthVectorVT(EVT VT, bool OverrideNEON=false) const
bool mergeStoresAfterLegalization(EVT VT) const override
SVE code generation for fixed length vectors does not custom lower BUILD_VECTOR.
Class for arbitrary precision integers.
Definition: APInt.h:76
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:212
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition: APInt.h:427
APInt zext(unsigned width) const
Zero extend to a new width.
Definition: APInt.cpp:981
static APInt getSignMask(unsigned BitWidth)
Get the SignMask for a specific bit width.
Definition: APInt.h:207
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1485
static void sdivrem(const APInt &LHS, const APInt &RHS, APInt &Quotient, APInt &Remainder)
Definition: APInt.cpp:1896
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1364
APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition: APInt.cpp:1002
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition: APInt.h:349
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1433
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition: APInt.h:187
APInt sadd_ov(const APInt &RHS, bool &Overflow) const
Definition: APInt.cpp:1934
bool sle(const APInt &RHS) const
Signed less or equal comparison.
Definition: APInt.h:1138
APInt uadd_ov(const APInt &RHS, bool &Overflow) const
Definition: APInt.cpp:1941
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition: APInt.h:1583
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition: APInt.h:197
APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition: APInt.cpp:1010
unsigned logBase2() const
Definition: APInt.h:1696
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:805
bool isMask(unsigned numBits) const
Definition: APInt.h:466
bool isNonNegative() const
Determine if this APInt Value is non-negative (>= 0)
Definition: APInt.h:312
APInt sext(unsigned width) const
Sign extend to a new width.
Definition: APInt.cpp:954
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:418
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:284
bool isSignBitSet() const
Determine if sign bit of this APInt is set.
Definition: APInt.h:319
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:274
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition: APInt.h:1209
bool isOne() const
Determine if this is a value of 1.
Definition: APInt.h:367
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1507
an instruction to allocate memory on the stack
Definition: Instructions.h:58
This class represents an incoming formal argument to a Function.
Definition: Argument.h:28
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:521
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:726
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:756
@ Or
*p = old | v
Definition: Instructions.h:750
@ And
*p = old & v
Definition: Instructions.h:746
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:754
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:760
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:758
@ Nand
*p = ~(old & v)
Definition: Instructions.h:748
bool isFloatingPointOperation() const
Definition: Instructions.h:897
BinOp getOperation() const
Definition: Instructions.h:820
This is an SDNode representing atomic operations.
bool hasFnAttr(Attribute::AttrKind Kind) const
Return true if the attribute exists for the function.
static Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
Definition: Attributes.cpp:92
LLVM Basic Block Representation.
Definition: BasicBlock.h:60
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:213
const BlockAddress * getBlockAddress() const
A "pseudo-class" with methods for operating on BUILD_VECTORs.
ConstantFPSDNode * getConstantFPSplatNode(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted constant FP or null if this is not a constant FP splat.
bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
ConstantSDNode * getConstantSplatNode(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted constant or null if this is not a constant splat.
int32_t getConstantFPSplatPow2ToLog2Int(BitVector *UndefElements, uint32_t BitWidth) const
If this is a constant FP splat and the splatted constant FP is an exact power or 2,...
CCState - This class holds information needed while lowering arguments and return values.
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
CCValAssign - Represent assignment of one arg/retval to a location.
bool isRegLoc() const
Register getLocReg() const
LocInfo getLocInfo() const
bool needsCustom() const
bool isMemLoc() const
int64_t getLocMemOffset() const
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1426
unsigned arg_size() const
Definition: InstrTypes.h:1424
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
Definition: InstrTypes.h:1610
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
bool isZero() const
Return true if the value is positive or negative zero.
This is the shared class of boolean and integer constants.
Definition: Constants.h:79
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition: Constants.h:197
static Constant * get(Type *Ty, uint64_t V, bool IsSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
Definition: Constants.cpp:888
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:137
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
int64_t getSExtValue() const
static Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:1342
This is an important base class in LLVM.
Definition: Constant.h:41
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:356
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
bool isLittleEndian() const
Layout endianness...
Definition: DataLayout.h:238
bool isBigEndian() const
Definition: DataLayout.h:239
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:504
Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Definition: DataLayout.cpp:874
A debug info location.
Definition: DebugLoc.h:33
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:202
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition: TypeSize.h:299
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:296
constexpr bool isScalar() const
Exactly one element.
Definition: TypeSize.h:307
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition: FastISel.h:66
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:539
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:692
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Definition: DerivedTypes.h:168
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Type * getParamType(unsigned i) const
Parameter type accessors.
Definition: DerivedTypes.h:135
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:677
bool empty() const
Definition: Function.h:801
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:200
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition: Function.h:674
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:262
Constant * getPersonalityFn() const
Get the personality function associated with this function.
Definition: Function.cpp:1867
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition: Function.h:338
arg_iterator arg_end()
Definition: Function.h:819
arg_iterator arg_begin()
Definition: Function.h:810
size_t size() const
Definition: Function.h:800
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:341
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.cpp:666
const GlobalValue * getGlobal() const
bool isThreadLocal() const
If the value is "Thread Local", its value isn't shared by the threads.
Definition: GlobalValue.h:263
bool hasExternalWeakLinkage() const
Definition: GlobalValue.h:528
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:655
Type * getValueType() const
Definition: GlobalValue.h:296
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:94
Value * CreateZExtOrBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2120
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2006
CallInst * CreateExtractVector(Type *DstType, Value *SrcVec, Value *Idx, const Twine &Name="")
Create a call to the vector.extract intrinsic.
Definition: IRBuilder.h:1031
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2455
Value * CreateConstGEP1_32(Type *Ty, Value *Ptr, unsigned Idx0, const Twine &Name="")
Definition: IRBuilder.h:1880
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2506
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Definition: IRBuilder.h:1039
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:533
Value * CreatePointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2153
Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains NumElts copies of V.
Definition: IRBuilder.cpp:1212
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2499
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:460
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:930
Value * CreateFPToUI(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2056
Value * CreateIntToPtr(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2105
Value * CreateLShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1431
ConstantInt * getInt8(uint8_t C)
Get a constant 8-bit value.
Definition: IRBuilder.h:470
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:174
Value * CreateUIToFP(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2070
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition: IRBuilder.h:485
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2110
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1410
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition: IRBuilder.h:2010
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2477
LLVMContext & getContext() const
Definition: IRBuilder.h:176
Value * CreatePtrToInt(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2100
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1491
PointerType * getPtrTy(unsigned AddrSpace=0)
Fetch the type representing a pointer.
Definition: IRBuilder.h:563
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args=std::nullopt, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2395
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", bool IsInBounds=false)
Definition: IRBuilder.h:1865
IntegerType * getInt8Ty()
Fetch the type representing an 8-bit integer.
Definition: IRBuilder.h:510
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2649
This instruction inserts a single (scalar) element into a VectorType value.
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not belong to a module.
Definition: Instruction.cpp:71
const BasicBlock * getParent() const
Definition: Instruction.h:139
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:93
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:75
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:239
Class to represent integer types.
Definition: DerivedTypes.h:40
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:47
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:54
constexpr unsigned getScalarSizeInBits() const
Definition: LowLevelType.h:257
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
Definition: LowLevelType.h:42
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
Definition: LowLevelType.h:92
constexpr TypeSize getSizeInBytes() const
Returns the total size of the type in bytes, i.e.
Definition: LowLevelType.h:193
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
An instruction for reading from memory.
Definition: Instructions.h:177
Value * getPointerOperand()
Definition: Instructions.h:264
Type * getPointerOperandType() const
Definition: Instructions.h:267
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
bool is128BitVector() const
Return true if this is a 128-bit vector type.
static auto integer_fixedlen_vector_valuetypes()
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
@ INVALID_SIMPLE_VALUE_TYPE
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:581
bool isScalableVT() const
Return true if the type is a scalable type.
static auto all_valuetypes()
SimpleValueType Iteration.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto scalable_vector_valuetypes()
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool isFixedLengthVector() const
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
static auto fp_valuetypes()
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
bool is64BitVector() const
Return true if this is a 64-bit vector type.
static auto fp_fixedlen_vector_valuetypes()
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
Instructions::iterator instr_iterator
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
SSPLayoutKind getObjectSSPLayout(int ObjectIdx) const
int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca=nullptr, uint8_t ID=0)
Create a new statically sized stack object, returning a nonnegative identifier to represent it.
@ SSPLK_None
Did not trigger a stack protector.
void setFrameAddressIsTaken(bool T)
int getStackProtectorIndex() const
Return the index for the stack protector object.
int CreateSpillStackObject(uint64_t Size, Align Alignment)
Create a new statically sized stack object that represents a spill slot, returning a nonnegative iden...
void setStackID(int ObjectIdx, uint8_t ID)
void setHasTailCall(bool V=true)
bool hasMustTailInVarArgFunc() const
Returns true if the function is variadic and contains a musttail call.
void setReturnAddressIsTaken(bool s)
void computeMaxCallFrameSize(const MachineFunction &MF)
Computes the maximum size of a callframe and the AdjustsStack property.
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
int CreateVariableSizedObject(Align Alignment, const AllocaInst *Alloca)
Notify the MachineFrameInfo object that a variable sized object has been created.
int getObjectIndexEnd() const
Return one past the maximum frame object index.
bool hasStackProtectorIndex() const
uint8_t getStackID(int ObjectIdx) const
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
void setObjectAlignment(int ObjectIdx, Align Alignment)
setObjectAlignment - Change the alignment of the specified stack object.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
Definition: MachineInstr.h:68
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value.
MachineOperand class - Representation of each machine instruction operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
An SDNode that represents everything that will be needed to construct a MachineInstr.
size_type size() const
Definition: MapVector.h:60
This class is used to represent an MGATHER node.
const SDValue & getPassThru() const
ISD::LoadExtType getExtensionType() const
This is a base class used to represent MGATHER and MSCATTER nodes.
const SDValue & getIndex() const
const SDValue & getScale() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
ISD::MemIndexType getIndexType() const
How is Index applied to BasePtr when computing addresses.
This class is used to represent an MLOAD node.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
const SDValue & getMask() const
const SDValue & getPassThru() const
const SDValue & getOffset() const
bool isUnindexed() const
Return true if this is NOT a pre/post inc/dec load/store.
ISD::MemIndexedMode getAddressingMode() const
Return the addressing mode for this load or store: unindexed, pre-inc, pre-dec, post-inc,...
This class is used to represent an MSCATTER node.
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This class is used to represent an MSTORE node.
const SDValue & getOffset() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
bool isVolatile() const
Align getOriginalAlign() const
Returns alignment and volatility of the memory access.
AtomicOrdering getSuccessOrdering() const
Return the atomic ordering requirements for this memory operation.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getBasePtr() const
const MachinePointerInfo & getPointerInfo() const
AtomicOrdering getMergedOrdering() const
Return a single atomic ordering that is at least as strong as both the success and failure orderings ...
const SDValue & getChain() const
bool isNonTemporal() const
bool isAtomic() const
Return true if the memory operation ordering is Unordered or higher.
EVT getMemoryVT() const
Return the type of the in-memory value.
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:275
Diagnostic information for optimization analysis remarks.
The optimization diagnostic interface.
void dump() const
Definition: Pass.cpp:136
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:662
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Definition: Constants.cpp:1743
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
This class provides iterator support for SDUse operands that use a specific SDNode.
Represents one node in the SelectionDAG.
bool isStrictFPOpcode()
Test if this node is a strict floating point pseudo-op.
ArrayRef< SDUse > ops() const
void dump() const
Dump this node, for debugging.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< value_op_iterator > op_values() const
iterator_range< use_iterator > uses()
size_t use_size() const
Return the number of uses of this node.
static bool hasPredecessorHelper(const SDNode *N, SmallPtrSetImpl< const SDNode * > &Visited, SmallVectorImpl< const SDNode * > &Worklist, unsigned int MaxSteps=0, bool TopologicalPrune=false)
Returns true if N is a predecessor of any node in Worklist.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
use_iterator use_begin() const
Provide iteration support to walk over all uses of an SDNode.
bool hasAnyUseOfValue(unsigned Value) const
Return true if there are any use of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
void setCFIType(uint32_t Type)
bool isUndef() const
Return true if the type of the node type undefined.
void setFlags(SDNodeFlags NewFlags)
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
void dump() const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
get the index which selects a specific result in the SDNode
uint64_t getConstantOperandVal(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
void setNode(SDNode *N)
set the SDNode
unsigned getOpcode() const
unsigned getNumOperands() const
SMEAttrs is a utility class to parse the SME ACLE attributes on functions.
bool hasStreamingInterface() const
bool hasZAState() const
Class to represent scalable SIMD vectors.
Definition: DerivedTypes.h:586
static ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition: Type.cpp:713
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:225
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:720
SDValue getMaskedGather(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, ISD::LoadExtType ExtTy)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS)
Helper function to make it easier to build Select's if you just have operands and don't want to check...
const TargetSubtargetInfo & getSubtarget() const
Definition: SelectionDAG.h:474
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
SDValue getSplatValue(SDValue V, bool LegalTypes=false)
If V is a splat vector, return its scalar source operand by extracting that element from the source v...
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
SDValue getVScale(const SDLoc &DL, EVT VT, APInt MulImm, bool ConstantFold=true)
Return a node that represents the runtime scaling 'MulImm * RuntimeVL'.
SDNode * isConstantIntBuildVectorOrConstantInt(SDValue N) const
Test whether the given value is a constant int or similar node.
SDValue makeEquivalentMemoryOrdering(SDValue OldChain, SDValue NewMemOpChain)
If an existing load has uses of its chain, create a token factor node with that chain and the new mem...
SDValue getJumpTableDebugInfo(int JTI, SDValue Chain, const SDLoc &DL)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
void addNoMergeSiteInfo(const SDNode *Node, bool NoMerge)
Set NoMergeSiteInfo to be associated with Node if NoMerge is true.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:478
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
Definition: SelectionDAG.h:730
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:826
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, bool isTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), AAResults *AA=nullptr)
bool isSplatValue(SDValue V, const APInt &DemandedElts, APInt &UndefElts, unsigned Depth=0) const
Test whether V has a splatted value for all the demanded elements.
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:472
void addCallSiteInfo(const SDNode *Node, CallSiteInfoImpl &&CallInfo)
Set CallSiteInfo to be associated with Node.
const SelectionDAGTargetInfo & getSelectionDAGInfo() const
Definition: SelectionDAG.h:480
bool areNonVolatileConsecutiveLoads(LoadSDNode *LD, LoadSDNode *Base, unsigned Bytes, int Dist) const
Return true if loads are next to each other and can be merged.
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
SDValue getGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, bool isTargetGA=false, unsigned TargetFlags=0)
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
Definition: SelectionDAG.h:659
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provides VTs and return the low/high part.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
bool SignBitIsZero(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero.
SDValue getRegister(unsigned Reg, EVT VT)
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
SDValue getBoolExtOrTrunc(SDValue Op, const SDLoc &SL, EVT VT, EVT OpVT)
Convert Op, which must be of integer type, to the integer type VT, by using an extension appropriate ...
SDValue getMaskedStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Base, SDValue Offset, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, bool IsTruncating=false, bool IsCompressing=false)
SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:473
SDValue getStepVector(const SDLoc &DL, EVT ResVT, APInt StepVal)
Returns a vector of type ResVT whose elements contain the linear sequence <0, Step,...
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, unsigned Reg, SDValue N)
Definition: SelectionDAG.h:771
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, uint64_t Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:674
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT, int64_t Offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:766
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:469
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT)
Definition: SelectionDAG.h:797
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
Definition: SelectionDAG.h:843
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getRegisterMask(const uint32_t *RegMask)
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getCondCode(ISD::CondCode Cond)
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
LLVMContext * getContext() const
Definition: SelectionDAG.h:485
SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags)
Get the specified node if it's already available, or else return NULL.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:737
SDValue getTargetInsertSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand, SDValue Subreg)
A convenience function for creating TargetInstrInfo::INSERT_SUBREG nodes.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:554
SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
SDValue getMaskedScatter(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, bool IsTruncating=false)
This instruction constructs a fixed permutation of two input vectors.
VectorType * getType() const
Overload to return most specific vector type.
static bool isSingleSourceMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector.
static void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
static bool isSplatMask(const int *Mask, EVT VT)
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:366
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:451
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:166
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:950
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:818
void push_back(const T &Elt)
Definition: SmallVector.h:426
pointer data()
Return a pointer to the vector's buffer, even if empty().
Definition: SmallVector.h:299
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
StackOffset holds a fixed and a scalable offset in bytes.
Definition: TypeSize.h:33
An instruction for storing to memory.
Definition: Instructions.h:301
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
bool getAsInteger(unsigned Radix, T &Result) const
Parse the current string as an integer of the specified radix.
Definition: StringRef.h:466
StringRef slice(size_t Start, size_t End) const
Return a reference to the substring from [Start, End).
Definition: StringRef.h:680
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:137
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:44
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:69
R Default(T Value)
Definition: StringSwitch.h:182
Class to represent struct types.
Definition: DerivedTypes.h:216
static StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition: Type.cpp:373
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
EVT getMemValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
MachineBasicBlock * emitPatchPoint(MachineInstr &MI, MachineBasicBlock *MBB) const
Replace/modify any TargetFrameIndex operands with a target-dependent sequence of memory operands that...
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
virtual Value * getSafeStackPointerLocation(IRBuilderBase &IRB) const
Returns the target-specific address of the unsafe stack pointer.
ShiftLegalizationStrategy
Return the preferred strategy to legalize this SHIFT instruction, with ExpansionFactor being the recu...
virtual bool shouldLocalize(const MachineInstr &MI, const TargetTransformInfo *TTI) const
Check whether or not MI needs to be moved close to its uses.
void setMaximumJumpTableSize(unsigned)
Indicate the maximum number of entries in jump tables.
const TargetMachine & getTargetMachine() const
unsigned MaxLoadsPerMemcmp
Specify maximum number of load instructions per memcmp call.
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
unsigned MaxGluedStoresPerMemcpy
Specify max number of store instructions to glue in inlined memcpy.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether a types are legal for a target, and if not, what action should be used to...
void setMaxBytesForAlignment(unsigned MaxBytes)
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual Value * getSDagStackGuard(const Module &M) const
Return the variable that's previously inserted by insertSSPDeclarations, if any, otherwise return nul...
void setIndexedLoadAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed load does or does not work with the specified type and indicate w...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
unsigned getMaximumJumpTableSize() const
Return upper limit for number of entries in a jump table.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
virtual Function * getSSPStackGuardCheck(const Module &M) const
If the target has a standard stack protection check function that performs validation and error handl...
void setMinFunctionAlignment(Align Alignment)
Set the target's minimum function alignment.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
virtual Value * getIRStackGuard(IRBuilderBase &IRB) const
If the target has a standard location for the stack protector guard, returns the address of that loca...
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setIndexedStoreAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed store does or does not work with the specified type and indicate ...
void setLibcallName(RTLIB::Libcall Call, const char *Name)
Rename the default libcall routine name for the specified libcall.
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
virtual bool isLegalAddImmediate(int64_t) const
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const
Return true if it is profitable to reduce a load to a smaller type.
virtual bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
unsigned MaxLoadsPerMemcmpOptSize
Likewise for functions with the OptSize attribute.
bool isLoadExtLegalOrCustom(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return true if the specified load with extension is legal or custom on this target.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
const char * getLibcallName(RTLIB::Libcall Call) const
Get the libcall routine name for the specified libcall.
std::vector< ArgListEntry > ArgListTy
virtual EVT getAsmOperandValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
MVT getFrameIndexTy(const DataLayout &DL) const
Return the type for frame index, which is determined by the alloca address space specified through th...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
MVT getRegisterType(MVT VT) const
Return the type of registers that this ValueType will eventually require.
virtual void insertSSPDeclarations(Module &M) const
Inserts necessary declarations for SSP (stack protection) purpose.
virtual bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const
Should we generate fp_to_si_sat and fp_to_ui_sat from type FPVT to type VT from min(max(fptoi)) satur...
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue buildSDIVPow2WithCMov(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl< SDNode * > &Created) const
Build sdiv by power-of-2 with conditional move instructions Ref: "Hacker's Delight" by Henry Warren 1...
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
virtual bool useLoadStackGuardNode() const
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
virtual bool isTargetCanonicalConstantNode(SDValue Op) const
Returns true if the given Opc is considered a canonical constant for the target, which should not be ...
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool isPositionIndependent() const
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
void expandShiftParts(SDNode *N, SDValue &Lo, SDValue &Hi, SelectionDAG &DAG) const
Expand shift-by-parts.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:78
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
const Triple & getTargetTriple() const
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
TargetOptions Options
CodeModel::Model getCodeModel() const
Returns the code model.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
unsigned TLSSize
Bit size of immediate TLS offsets (0 == use the default).
unsigned NoNaNsFPMath
NoNaNsFPMath - This flag is enabled when the -enable-no-nans-fp-math flag is specified on the command...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TargetCostKind CostKind) const
Return the expected cost of materializing for the given integer immediate of the specified type.
@ TCC_Free
Expected to fold away in lowering.
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
bool isOSMSVCRT() const
Is this a "Windows" OS targeting a "MSVCRT.dll" environment.
Definition: Triple.h:642
bool isWindowsMSVCEnvironment() const
Checks if the environment could be MSVC.
Definition: Triple.h:609
This class represents a truncation of integer types.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:332
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
static Type * getHalfTy(LLVMContext &C)
static Type * getDoubleTy(LLVMContext &C)
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:265
bool isArrayTy() const
True if this is an instance of ArrayType.
Definition: Type.h:252
static Type * getBFloatTy(LLVMContext &C)
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:255
static IntegerType * getInt1Ty(LLVMContext &C)
@ FloatTyID
32-bit floating point type
Definition: Type.h:58
@ DoubleTyID
64-bit floating point type
Definition: Type.h:59
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static Type * getVoidTy(LLVMContext &C)
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
Definition: Type.h:302
static IntegerType * getInt16Ty(LLVMContext &C)
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:129
static IntegerType * getInt8Ty(LLVMContext &C)
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:185
bool isScalableTy() const
Return true if this is a type whose size is a known multiple of vscale.
static IntegerType * getInt32Ty(LLVMContext &C)
static IntegerType * getInt64Ty(LLVMContext &C)
static Type * getFloatTy(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
TypeID getTypeID() const
Return the type id for the type.
Definition: Type.h:137
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Type * getContainedType(unsigned i) const
This method is used to implement the type iterator (defined at the end of the file).
Definition: Type.h:377
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:348
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1724
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
const Use & getOperandUse(unsigned i) const
Definition: User.h:182
Value * getOperand(unsigned i) const
Definition: User.h:169
unsigned getNumOperands() const
Definition: User.h:191
This class is used to represent EVT's, which are used to parameterize some operations.
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
Base class of all SIMD vector types.
Definition: DerivedTypes.h:403
static VectorType * getHalfElementsVectorType(VectorType *VTy)
This static method returns a VectorType with half as many elements as the input type and the same ele...
Definition: DerivedTypes.h:507
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:641
static VectorType * getInteger(VectorType *VTy)
This static method gets a VectorType with the same number of elements as the input type,...
Definition: DerivedTypes.h:454
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Definition: Type.cpp:676
static VectorType * getTruncatedElementVectorType(VectorType *VTy)
Definition: DerivedTypes.h:472
Type * getElementType() const
Definition: DerivedTypes.h:436
This class represents zero extension of integer types.
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:187
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:168
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition: TypeSize.h:239
self_iterator getIterator()
Definition: ilist_node.h:109
#define UINT64_MAX
Definition: DataTypes.h:77
#define INT64_MAX
Definition: DataTypes.h:71
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static CondCode getInvertedCondCode(CondCode Code)
static unsigned getNZCVToSatisfyCondCode(CondCode Code)
Given a condition code, return NZCV flags that would satisfy that condition.
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand, this represents that the reference to the symbol is for an import...
@ MO_NC
MO_NC - Indicates whether the linker is expected to check the symbol reference for overflow.
@ MO_G1
MO_G1 - A symbol operand with this flag (granule 1) represents the bits 16-31 of a 64-bit address,...
@ MO_PAGEOFF
MO_PAGEOFF - A symbol operand with this flag represents the offset of that symbol within a 4K page.
@ MO_GOT
MO_GOT - This flag indicates that a symbol operand represents the address of the GOT entry for the sy...
@ MO_G0
MO_G0 - A symbol operand with this flag (granule 0) represents the bits 0-15 of a 64-bit address,...
@ MO_PAGE
MO_PAGE - A symbol operand with this flag represents the pc-relative offset of the 4K page containing...
@ MO_HI12
MO_HI12 - This flag indicates that a symbol operand represents the bits 13-24 of a 64-bit address,...
@ MO_TLS
MO_TLS - Indicates that the operand being accessed is some kind of thread-local symbol.
@ MO_G2
MO_G2 - A symbol operand with this flag (granule 2) represents the bits 32-47 of a 64-bit address,...
@ MO_G3
MO_G3 - A symbol operand with this flag (granule 3) represents the high 16-bits of a 64-bit address,...
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
@ NVCAST
Natural vector cast.
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the...
static uint8_t encodeAdvSIMDModImmType2(uint64_t Imm)
static bool isAdvSIMDModImmType9(uint64_t Imm)
static bool isAdvSIMDModImmType4(uint64_t Imm)
static bool isAdvSIMDModImmType5(uint64_t Imm)
static int getFP32Imm(const APInt &Imm)
getFP32Imm - Return an 8-bit floating-point version of the 32-bit floating-point value.
static uint8_t encodeAdvSIMDModImmType7(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType12(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType10(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType9(uint64_t Imm)
static uint64_t encodeLogicalImmediate(uint64_t imm, unsigned regSize)
encodeLogicalImmediate - Return the encoded immediate value for a logical immediate instruction of th...
static bool isAdvSIMDModImmType7(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType5(uint64_t Imm)
static int getFP64Imm(const APInt &Imm)
getFP64Imm - Return an 8-bit floating-point version of the 64-bit floating-point value.
static bool isAdvSIMDModImmType10(uint64_t Imm)
static int getFP16Imm(const APInt &Imm)
getFP16Imm - Return an 8-bit floating-point version of the 16-bit floating-point value.
static uint8_t encodeAdvSIMDModImmType8(uint64_t Imm)
static bool isAdvSIMDModImmType12(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType11(uint64_t Imm)
static bool isAdvSIMDModImmType11(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType6(uint64_t Imm)
static bool isAdvSIMDModImmType8(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType4(uint64_t Imm)
static bool isAdvSIMDModImmType6(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType1(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType3(uint64_t Imm)
static bool isAdvSIMDModImmType2(uint64_t Imm)
static bool isAdvSIMDModImmType3(uint64_t Imm)
static bool isAdvSIMDModImmType1(uint64_t Imm)
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
ArrayRef< MCPhysReg > getFPRArgRegs()
int getSMEPseudoMap(uint16_t Opcode)
static constexpr unsigned SVEMaxBitsPerVector
const unsigned RoundingBitsPos
static constexpr unsigned SVEBitsPerBlock
ArrayRef< MCPhysReg > getGPRArgRegs()
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo)
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
@ ARM64EC_Thunk_Native
Calling convention used in the ARM64EC ABI to implement calls between ARM64 code and thunks.
Definition: CallingConv.h:262
@ AArch64_VectorCall
Used between AArch64 Advanced SIMD functions.
Definition: CallingConv.h:218
@ Swift
Calling convention for Swift.
Definition: CallingConv.h:69
@ AArch64_SVE_VectorCall
Used between AArch64 SVE functions.
Definition: CallingConv.h:221
@ CFGuard_Check
Special calling convention on Windows for calling the Control Guard Check ICall function.
Definition: CallingConv.h:82
@ PreserveMost
Used for runtime calls that preserves most registers.
Definition: CallingConv.h:63
@ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2
Preserve X2-X15, X19-X29, SP, Z0-Z31, P0-P15.
Definition: CallingConv.h:238
@ CXX_FAST_TLS
Used for access functions.
Definition: CallingConv.h:72
@ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0
Preserve X0-X13, X19-X29, SP, Z0-Z31, P0-P15.
Definition: CallingConv.h:235
@ GHC
Used by the Glasgow Haskell Compiler (GHC).
Definition: CallingConv.h:50
@ PreserveAll
Used for runtime calls that preserves (almost) all registers.
Definition: CallingConv.h:66
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition: CallingConv.h:76
@ Win64
The C convention as implemented on Windows/x86-64 and AArch64.
Definition: CallingConv.h:156
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition: CallingConv.h:87
@ GRAAL
Used by GraalVM. Two additional registers are reserved.
Definition: CallingConv.h:252
@ ARM64EC_Thunk_X64
Calling convention used in the ARM64EC ABI to implement calls between x64 code and thunks.
Definition: CallingConv.h:257
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
bool isConstantSplatVectorAllOnes(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are ~0 ...
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition: ISDOpcodes.h:40
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:750
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition: ISDOpcodes.h:236
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to it returns an output chain.
Definition: ISDOpcodes.h:1124
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
Definition: ISDOpcodes.h:1120
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition: ISDOpcodes.h:476
@ VECREDUCE_SEQ_FADD
Generic reduction nodes.
Definition: ISDOpcodes.h:1331
@ VECREDUCE_SMIN
Definition: ISDOpcodes.h:1362
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:250
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:559
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:714
@ VAEND
VAEND, VASTART - VAEND and VASTART have three operands: an input chain, pointer, and a SRCVALUE.
Definition: ISDOpcodes.h:1153
@ ConstantFP
Definition: ISDOpcodes.h:77
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, ptr, val) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:1233
@ STRICT_FCEIL
Definition: ISDOpcodes.h:426
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:239
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1029
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:783
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:483
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:199
@ RETURNADDR
Definition: ISDOpcodes.h:95
@ GlobalAddress
Definition: ISDOpcodes.h:78
@ STRICT_FMINIMUM
Definition: ISDOpcodes.h:436
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:790
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:543
@ VECREDUCE_FMAX
FMIN/FMAX nodes can have flags, for NaN/NoNaN variants.
Definition: ISDOpcodes.h:1347
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:390
@ VECREDUCE_FMAXIMUM
FMINIMUM/FMAXIMUM nodes propagate NaNs and signed zeroes using the llvm.minimum and llvm....
Definition: ISDOpcodes.h:1351
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:688
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:255
@ VECREDUCE_SMAX
Definition: ISDOpcodes.h:1361
@ STRICT_FSETCCS
Definition: ISDOpcodes.h:477
@ STRICT_FLOG2
Definition: ISDOpcodes.h:421
@ ATOMIC_LOAD_OR
Definition: ISDOpcodes.h:1259
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:903
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:229
@ ATOMIC_LOAD_XOR
Definition: ISDOpcodes.h:1260
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:937
@ STRICT_FSQRT
Constrained versions of libm-equivalent floating point intrinsics.
Definition: ISDOpcodes.h:411
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
Definition: ISDOpcodes.h:1383
@ GlobalTLSAddress
Definition: ISDOpcodes.h:79
@ SET_ROUNDING
Set rounding mode.
Definition: ISDOpcodes.h:885
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:774
@ AVGCEILS
AVGCEILS/AVGCEILU - Rounding averaging add - Add two integers using an integer of type i[N+2],...
Definition: ISDOpcodes.h:662
@ STRICT_UINT_TO_FP
Definition: ISDOpcodes.h:450
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition: ISDOpcodes.h:620
@ ADDROFRETURNADDR
ADDROFRETURNADDR - Represents the llvm.addressofreturnaddress intrinsic.
Definition: ISDOpcodes.h:101
@ VECREDUCE_FADD
These reductions have relaxed evaluation order semantics, and have a single vector operand.
Definition: ISDOpcodes.h:1344
@ WRITE_REGISTER
Definition: ISDOpcodes.h:119
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
Definition: ISDOpcodes.h:1213
@ VECREDUCE_FMIN
Definition: ISDOpcodes.h:1348
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
Definition: ISDOpcodes.h:986
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition: ISDOpcodes.h:758
@ STRICT_LROUND
Definition: ISDOpcodes.h:431
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:928
@ BR_CC
BR_CC - Conditional branch.
Definition: ISDOpcodes.h:1075
@ SSUBO
Same for subtraction.
Definition: ISDOpcodes.h:327
@ BRIND
BRIND - Indirect branch.
Definition: ISDOpcodes.h:1050
@ BR_JT
BR_JT - Jumptable branch.
Definition: ISDOpcodes.h:1054
@ VECTOR_INTERLEAVE
VECTOR_INTERLEAVE(VEC1, VEC2) - Returns two vectors with all input and output vectors having the same...
Definition: ISDOpcodes.h:586
@ STEP_VECTOR
STEP_VECTOR(IMM) - Returns a scalable vector whose lanes are comprised of a linear sequence of unsign...
Definition: ISDOpcodes.h:646
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:349
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:727
@ STRICT_FPOWI
Definition: ISDOpcodes.h:413
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1229
@ UNDEF
UNDEF - An undefined node.
Definition: ISDOpcodes.h:211
@ VECREDUCE_UMAX
Definition: ISDOpcodes.h:1363
@ SPLAT_VECTOR
SPLAT_VECTOR(VAL) - Returns a vector with the scalar value VAL duplicated in all lanes.
Definition: ISDOpcodes.h:627
@ VACOPY
VACOPY - VACOPY has 5 operands: an input chain, a destination pointer, a source pointer,...
Definition: ISDOpcodes.h:1149
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition: ISDOpcodes.h:323
@ STRICT_FTRUNC
Definition: ISDOpcodes.h:430
@ VECREDUCE_ADD
Integer reductions may have a result type larger than the vector element type.
Definition: ISDOpcodes.h:1356
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
Definition: ISDOpcodes.h:880
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:651
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:705
@ ATOMIC_LOAD_CLR
Definition: ISDOpcodes.h:1258
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:600
@ ATOMIC_LOAD_AND
Definition: ISDOpcodes.h:1257
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:573
@ STRICT_FMAXIMUM
Definition: ISDOpcodes.h:435
@ EntryToken
EntryToken - This is the marker used to indicate the start of a region.
Definition: ISDOpcodes.h:47
@ STRICT_FMAXNUM
Definition: ISDOpcodes.h:424
@ READ_REGISTER
READ_REGISTER, WRITE_REGISTER - This node represents llvm.register on the DAG, which implements the n...
Definition: ISDOpcodes.h:118
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:535
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:203
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:780
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
Definition: ISDOpcodes.h:1203
@ FP_TO_UINT_SAT
Definition: ISDOpcodes.h:856
@ STRICT_FMINNUM
Definition: ISDOpcodes.h:425
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:742
@ VSCALE
VSCALE(IMM) - Returns the runtime scaling factor used to calculate the number of elements within a sc...
Definition: ISDOpcodes.h:1321
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
Definition: ISDOpcodes.h:1240
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:969
@ UBSANTRAP
UBSANTRAP - Trap with an immediate describing the kind of sanitizer failure.
Definition: ISDOpcodes.h:1207
@ SMULO
Same for multiplication.
Definition: ISDOpcodes.h:331
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1039
@ STRICT_LRINT
Definition: ISDOpcodes.h:433
@ ConstantPool
Definition: ISDOpcodes.h:82
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:798
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:674
@ VECTOR_REVERSE
VECTOR_REVERSE(VECTOR) - Returns a vector, of the same type as VECTOR, whose elements are shuffled us...
Definition: ISDOpcodes.h:591
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:888
@ STRICT_FROUND
Definition: ISDOpcodes.h:428
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition: ISDOpcodes.h:736
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:303
@ STRICT_SINT_TO_FP
STRICT_[US]INT_TO_FP - Convert a signed or unsigned integer to a floating point value.
Definition: ISDOpcodes.h:449
@ VECREDUCE_UMIN
Definition: ISDOpcodes.h:1364
@ STRICT_FFLOOR
Definition: ISDOpcodes.h:427
@ STRICT_FROUNDEVEN
Definition: ISDOpcodes.h:429
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition: ISDOpcodes.h:94
@ ATOMIC_LOAD_ADD
Definition: ISDOpcodes.h:1255
@ STRICT_FP_TO_UINT
Definition: ISDOpcodes.h:443
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition: ISDOpcodes.h:465
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:442
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:982
@ ATOMIC_LOAD_SUB
Definition: ISDOpcodes.h:1256
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:836
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition: ISDOpcodes.h:1180
@ TargetConstant
TargetConstant* - Like Constant*, but the DAG does not do any folding, simplification,...
Definition: ISDOpcodes.h:158
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:470
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:680
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1200
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:184
@ AVGFLOORS
AVGFLOORS/AVGFLOORU - Averaging add - Add two integers using an integer of type i[N+1],...
Definition: ISDOpcodes.h:657
@ STRICT_FADD
Constrained versions of the binary floating point operators.
Definition: ISDOpcodes.h:400
@ STRICT_FLOG10
Definition: ISDOpcodes.h:420
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:524
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:52
@ STRICT_LLRINT
Definition: ISDOpcodes.h:434
@ VECTOR_SPLICE
VECTOR_SPLICE(VEC1, VEC2, IMM) - Returns a subvector of the same type as VEC1/VEC2 from CONCAT_VECTOR...
Definition: ISDOpcodes.h:612
@ STRICT_FEXP2
Definition: ISDOpcodes.h:418
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
Definition: ISDOpcodes.h:1254
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:869
@ SPONENTRY
SPONENTRY - Represents the llvm.sponentry intrinsic.
Definition: ISDOpcodes.h:106
@ STRICT_LLROUND
Definition: ISDOpcodes.h:432
@ ZERO_EXTEND_VECTOR_INREG
ZERO_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register zero-extension of the low ...
Definition: ISDOpcodes.h:831
@ STRICT_FNEARBYINT
Definition: ISDOpcodes.h:423
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition: ISDOpcodes.h:855
@ VECREDUCE_FMINIMUM
Definition: ISDOpcodes.h:1352
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:786
@ VAARG
VAARG - VAARG has four operands: an input chain, a pointer, a SRCVALUE, and the alignment.
Definition: ISDOpcodes.h:1144
@ BRCOND
BRCOND - Conditional branch.
Definition: ISDOpcodes.h:1068
@ BlockAddress
Definition: ISDOpcodes.h:84
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition: ISDOpcodes.h:763
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition: ISDOpcodes.h:61
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:493
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:340
@ AssertZext
Definition: ISDOpcodes.h:62
@ STRICT_FRINT
Definition: ISDOpcodes.h:422
@ VECTOR_DEINTERLEAVE
VECTOR_DEINTERLEAVE(VEC1, VEC2) - Returns two vectors with all input and output vectors having the sa...
Definition: ISDOpcodes.h:580
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:313
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:192
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:515
bool isOverflowIntrOpRes(SDValue Op)
Returns true if the specified value is the overflow result from one of the overflow intrinsic nodes.
bool isExtOpcode(unsigned Opcode)
Definition: ISDOpcodes.h:1577
bool isConstantSplatVectorAllZeros(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are 0 o...
bool isVectorShrinkable(const SDNode *N, unsigned NewEltSize, bool Signed)
Returns true if the specified node is a vector where all elements can be truncated to the specified e...
CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
MemIndexType
MemIndexType enum - This enum defines how to interpret MGATHER/SCATTER's index parameter when calcula...
Definition: ISDOpcodes.h:1468
bool isBuildVectorAllZeros(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are 0 or undef.
bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
MemIndexedMode
MemIndexedMode enum - This enum defines the load / store indexed addressing modes.
Definition: ISDOpcodes.h:1455
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1506
bool isBuildVectorAllOnes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are ~0 or undef.
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1486
static const int LAST_INDEXED_MODE
Definition: ISDOpcodes.h:1457
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1444
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:724
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:780
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:147
CastInst_match< OpTy, Instruction::ZExt > m_ZExt(const OpTy &Op)
Matches ZExt.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
CastInst_match< OpTy, Instruction::SExt > m_SExt(const OpTy &Op)
Matches SExt.
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
match_combine_or< CastInst_match< OpTy, Instruction::ZExt >, CastInst_match< OpTy, Instruction::SExt > > m_ZExtOrSExt(const OpTy &Op)
VScaleVal_match m_VScale()
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:76
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_Undef()
Match an arbitrary undef constant.
Definition: PatternMatch.h:136
BinaryOp_match< cst_pred_ty< is_all_ones >, ValTy, Instruction::Xor, true > m_Not(const ValTy &V)
Matches a 'Not' as 'xor V, -1' or 'xor -1, V'.
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
Libcall
RTLIB::Libcall enum - This enum defines all of the runtime library calls the backend can emit.
@ Define
Register definition.
@ GeneralDynamic
Definition: CodeGen.h:46
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
CodeModel::Model getCodeModel()
std::optional< Function * > getAttachedARCFunction(const CallBase *CB)
This function returns operand bundle clang_arc_attachedcall's argument, which is the address of the A...
Definition: ObjCARCUtil.h:43
bool hasAttachedCallOpBundle(const CallBase *CB)
Definition: ObjCARCUtil.h:29
DiagnosticInfoOptimizationBase::Argument NV
@ FalseVal
Definition: TGLexer.h:59
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< TypeSize > *Offsets, TypeSize StartingOffset)
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition: Analysis.cpp:122
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
bool isPackedVectorType(EVT SomeVT)
Definition: VECustomDAG.cpp:22
bool RetCC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
@ Offset
Definition: DWP.cpp:456
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition: STLExtras.h:862
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1751
bool CC_AArch64_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1731
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &DL, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
void GetReturnInfo(CallingConv::ID CC, Type *ReturnType, AttributeList attr, SmallVectorImpl< ISD::OutputArg > &Outs, const TargetLowering &TLI, const DataLayout &DL)
Given an LLVM IR type and return type attributes, compute the return value EVTs and flags,...
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
std::optional< APInt > getIConstantVRegVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT, return the corresponding value.
Definition: Utils.cpp:293
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
bool isUIntN(unsigned N, uint64_t x)
Checks if an unsigned integer fits into the given (dynamic) bit width.
Definition: MathExtras.h:228
bool CC_AArch64_DarwinPCS_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
bool CC_AArch64_Win64PCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool RetCC_AArch64_Arm64EC_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
bool isIntOrFPConstant(SDValue V)
Return true if V is either an integer or FP constant.
bool CC_AArch64_Win64_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition: bit.h:317
Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
std::optional< unsigned > getSVEPredPatternFromNumElements(unsigned MinNumElts)
Return specific VL predicate pattern based on the number of elements.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition: MathExtras.h:269
bool isNullOrNullSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant 0 integer or a splatted vector of a constant 0 integer (with n...
Definition: Utils.cpp:1398
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:319
bool CC_AArch64_Arm64EC_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition: bit.h:215
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition: MathExtras.h:258
unsigned M1(unsigned Val)
Definition: VE.h:376
bool isReleaseOrStronger(AtomicOrdering AO)
static Error getOffset(const SymbolRef &Sym, SectionRef Sec, uint64_t &Result)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1738
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:313
bool CC_AArch64_Win64_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:264
bool CC_AArch64_Arm64EC_Thunk_Native(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool CC_AArch64_Arm64EC_Thunk(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:156
bool CC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
constexpr bool isMask_64(uint64_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit wit...
Definition: MathExtras.h:246
bool RetCC_AArch64_Arm64EC_Thunk(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
SDValue peekThroughOneUseBitcasts(SDValue V)
Return the non-bitcasted and one-use source operand of V if it exists.
EHPersonality classifyEHPersonality(const Value *Pers)
See if the given exception handling personality function is one that we understand.
CodeGenOptLevel
Code generation optimization level.
Definition: CodeGen.h:54
constexpr int PoisonMaskElem
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ Other
Any other memory.
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
CombineLevel
Definition: DAGCombine.h:15
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ And
Bitwise or logical AND of integers.
@ Add
Sum of integers.
bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition: MathExtras.h:233
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
bool CC_AArch64_DarwinPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition: VE.h:375
bool isAsynchronousEHPersonality(EHPersonality Pers)
Returns true if this personality function catches asynchronous exceptions.
bool isAcquireOrStronger(AtomicOrdering AO)
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1758
gep_type_iterator gep_type_begin(const User *GEP)
bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition: STLExtras.h:2025
unsigned getNumElementsFromSVEPredPattern(unsigned Pattern)
Return the number of active elements for VL1 to VL256 predicate pattern, zero for all other patterns.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1888
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
bool CC_AArch64_DarwinPCS_ILP32_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool isNullFPConstant(SDValue V)
Returns true if V is an FP constant with a value of positive zero.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer list are equal or the list is empty.
Definition: STLExtras.h:2013
static const MachineMemOperand::Flags MOStridedAccess
bool CC_AArch64_Arm64EC_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
llvm::SmallVector< int, 16 > createSequentialMask(unsigned Start, unsigned NumInts, unsigned NumUndefs)
Create a sequential shuffle mask.
bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
@ Enable
Enable colors.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
Helper structure to keep track of a SET_CC lowered into AArch64 code.
AArch64CC::CondCode CC
Helper structure to keep track of ISD::SET_CC operands.
This is used by foldLoadsRecursive() to capture a Root Load node which is of type or(load,...
Helper structure to be able to read SetCC information.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
Represent subnormal handling kind for floating point instruction inputs and outputs.
Extended Value Type.
Definition: ValueTypes.h:34
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
Definition: ValueTypes.h:93
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:373
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:129
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:73
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:113
uint64_t getScalarStoreSize() const
Definition: ValueTypes.h:380
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition: ValueTypes.h:267
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition: ValueTypes.h:283
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:139
ElementCount getVectorElementCount() const
Definition: ValueTypes.h:333
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:441
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:351
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition: ValueTypes.h:342
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:363
EVT getHalfSizedIntegerVT(LLVMContext &Context) const
Finds the smallest simple value type that is greater than or equal to half the width of this EVT.
Definition: ValueTypes.h:408
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition: ValueTypes.h:448
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition: ValueTypes.h:390
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:299
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition: ValueTypes.h:196
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:64
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition: ValueTypes.h:359
EVT widenIntegerVectorElementType(LLVMContext &Context) const
Return a VT for an integer vector type with the size of the elements doubled.
Definition: ValueTypes.h:422
bool isScalableVT() const
Return true if the type is a scalable type.
Definition: ValueTypes.h:176
bool isFixedLengthVector() const
Definition: ValueTypes.h:170
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition: ValueTypes.h:58
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:160
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:306
bool bitsGE(EVT VT) const
Return true if this has no less bits than VT.
Definition: ValueTypes.h:275
bool is256BitVector() const
Return true if this is a 256-bit vector type.
Definition: ValueTypes.h:201
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition: ValueTypes.h:239
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:202
bool isScalableVector() const
Return true if this is a vector type where the runtime length is machine dependent.
Definition: ValueTypes.h:166
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:311
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:149
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition: ValueTypes.h:101
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:319
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:431
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:144
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition: ValueTypes.h:191
Describes a register that needs to be forwarded from the prologue to a musttail call.
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing...
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition: KnownBits.h:292
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition: KnownBits.h:152
unsigned getBitWidth() const
Get the bit width of this value.
Definition: KnownBits.h:40
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition: KnownBits.h:302
static KnownBits lshr(const KnownBits &LHS, const KnownBits &RHS, bool ShAmtNonZero=false)
Compute known bits for lshr(LHS, RHS).
Definition: KnownBits.cpp:271
static KnownBits ashr(const KnownBits &LHS, const KnownBits &RHS, bool ShAmtNonZero=false)
Compute known bits for ashr(LHS, RHS).
Definition: KnownBits.cpp:317
static KnownBits shl(const KnownBits &LHS, const KnownBits &RHS, bool NUW=false, bool NSW=false, bool ShAmtNonZero=false)
Compute known bits for shl(LHS, RHS).
Definition: KnownBits.cpp:186
Structure used to represent pair of argument number after call lowering and register used to transfer...
This class contains a discriminated union of information about pointers in memory operands,...
unsigned getAddrSpace() const
Return the LLVM IR address space number that this pointer points into.
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
static MachinePointerInfo getUnknownStack(MachineFunction &MF)
Stack memory without other information.
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
Constraint for a predicate of the form "cmp Pred Op, OtherOp", where Op is the value the constraint a...
Definition: PredicateInfo.h:74
These are IR-level optimization flags that may be propagated to SDNodes.
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
A MapVector that performs no allocations if smaller than a certain size.
Definition: MapVector.h:254
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg If BaseGV is null...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::OutputArg, 32 > Outs
SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...
bool CombineTo(SDValue O, SDValue N)
Helper structure to keep track of SetCC information.
GenericSetCCInfo Generic
AArch64SetCCInfo AArch64